Exemple #1
0
 def test_normalize_tags_on_work(self, input, output, Graph):
     """Normalize and prune a work's tags, then compare the serialized nodes.

     ``input`` is the list of tags attached to the work and ``output`` the
     tags expected to remain. ``Graph`` is presumably the graph-building
     test fixture -- confirm against the project's conftest.
     """

     def _type_then_id(node):
         # Order nodes by type name followed by the stringified id.
         return node.type + str(node.id)

     graph = ChangeGraph(Graph(CreativeWork(tags=input)))
     graph.normalize()
     graph.prune()

     # Evaluate the normalized side first, then build the expected side.
     ordered_nodes = sorted(graph.nodes, key=_type_then_id)
     actual = [node.serialize() for node in ordered_nodes]
     expected = Graph(CreativeWork(tags=output))
     assert actual == expected
Exemple #2
0
class TestModelNormalization:

    # test each tag resolves to lowercased, tokenized name
    @pytest.mark.parametrize('input, output', [(i, o) for input, o in [
        ([
            Tag(name=''),
            Tag(name='        '),
            Tag(name='\n\n\n'),
        ], []),
        ([
            Tag(name='foo'),
            Tag(name='foO'),
            Tag(name='Foo'),
            Tag(name='FOO'),
            Tag(name='      FOO'),
            Tag(name='      foo\n\n\n'),
        ], [Tag(name='foo')]),
        ([
            Tag(name='Rocket League'),
            Tag(name='rocket league'),
            Tag(name='ROCKET LEAGUE'),
            Tag(name='Rocket         League'),
            Tag(name='\nRocket    \n     League\t'),
            Tag(name='rocket\nleague'),
        ], [Tag(name='rocket league')]),
        ([
            Tag(name='Crash; Bandicoot'),
            Tag(name='Crash;           Bandicoot'),
            Tag(name='\nCrash; Bandicoot'),
            Tag(name='crash, bandicoot'),
            Tag(name='Crash ,Bandicoot           '),
        ], [Tag(name='bandicoot'), Tag(name='crash')]),
    ] for i in input])
    def test_normalize_tag(self, input, output, Graph):
        """Check that one tag on a work is processed into the expected tags.

        ``input`` is a single tag, wrapped in a list as the work's only tag;
        ``output`` is the list of tags expected after processing with
        ``disambiguate=False``. ``Graph`` is presumably the graph-building
        test fixture -- confirm against the project's conftest.
        """
        work_with_tag = CreativeWork(tags=[input])
        graph = ChangeGraph(Graph(work_with_tag))
        graph.process(disambiguate=False)

        # Evaluate the processed side first, then build the expected side.
        actual = graph.serialize()
        expected = Graph(CreativeWork(tags=output))
        assert actual == expected

    # test tags with the same name are merged on a work
    @pytest.mark.parametrize('input, output', [
        ([
            Tag(name=''),
            Tag(name='        '),
            Tag(name='\n\n\n'),
        ], []),
        ([
            Tag(name='foo'),
            Tag(name='foO'),
            Tag(name='Foo'),
            Tag(name='FOO'),
            Tag(name='      FOO'),
            Tag(name='      foo\n\n\n'),
        ], [Tag(name='foo')]),
        ([
            Tag(name='Rocket League'),
            Tag(name='rocket league'),
            Tag(name='ROCKET LEAGUE'),
            Tag(name='Rocket         League'),
            Tag(name='\nRocket    \n     League\t'),
            Tag(name='rocket\nleague'),
        ], [Tag(name='rocket league')]),
        ([
            Tag(name='Crash; Bandicoot'),
            Tag(name='Crash;           Bandicoot'),
            Tag(name='\nCrash; Bandicoot'),
            Tag(name='crash, bandicoot'),
            Tag(name='Crash ,Bandicoot           '),
        ], [Tag(name='bandicoot'), Tag(name='crash')]),
    ])
    def test_normalize_tags_on_work(self, input, output, Graph):
        """Normalize and prune a work's tags, then compare the serialized nodes.

        ``input`` is the list of tags attached to the work and ``output`` the
        tags expected to remain. ``Graph`` is presumably the graph-building
        test fixture -- confirm against the project's conftest.
        """

        def _type_then_id(node):
            # Order nodes by type name followed by the stringified id.
            return node.type + str(node.id)

        graph = ChangeGraph(Graph(CreativeWork(tags=input)))
        graph.normalize()
        graph.prune()

        # Evaluate the normalized side first, then build the expected side.
        ordered_nodes = sorted(graph.nodes, key=_type_then_id)
        actual = [node.serialize() for node in ordered_nodes]
        expected = Graph(CreativeWork(tags=output))
        assert actual == expected

    @pytest.mark.parametrize(
        'input, output',
        [(i, o)
         for input, o in [([
             Person(name='Smith, J'),
             Person(name='J    Smith   '),
             Person(name='Smith,     J'),
             Person(given_name='J', family_name='Smith'),
             Person(given_name='  J', family_name='\n\nSmith'),
         ], Person(name='J Smith', family_name='Smith', given_name='J')),
                          ([
                              Person(name='Johnathan James Doe'),
                              Person(name='johnathan james doe'),
                          ],
                           Person(name='Johnathan James Doe',
                                  family_name='Doe',
                                  given_name='Johnathan',
                                  additional_name='James')),
                          ([
                              Person(name='johnathan james doe JR'),
                          ],
                           Person(name='Johnathan James Doe Jr',
                                  family_name='Doe',
                                  given_name='Johnathan',
                                  additional_name='James',
                                  suffix='Jr')),
                          ([
                              Person(name='none'),
                              Person(name=''),
                              Person(name='NULL'),
                              Person(name='None'),
                              Person(name='           '),
                              Person(name='     None      '),
                          ], None)] for i in input])
    def test_normalize_person(self, input, output, Graph):
        """Process a graph holding one person and compare its serialization.

        ``input`` is a single node passed straight to ``Graph``. When
        ``output`` is falsy (the parametrized cases use ``None``), the
        serialized graph is expected to equal an empty list; otherwise it is
        compared with ``Graph(output)``.
        """
        graph = ChangeGraph(Graph(input))
        graph.process(disambiguate=False)

        # Evaluate the processed side first, then build the expected side.
        actual = graph.serialize()
        if output:
            expected = Graph(output)
        else:
            expected = []
        assert actual == expected

    # test two people with the same identifier are merged
    # sort by length and then alphabetize name field
    @pytest.mark.parametrize(
        'input, output',
        [
            # same name, same identifier
            ([
                Person(0, name='Barb Dylan', identifiers=[AgentIdentifier(1)]),
                Person(1, name='Barb Dylan', identifiers=[AgentIdentifier(1)]),
                Person(2, name='Barb Dylan', identifiers=[AgentIdentifier(1)]),
            ], [
                Person(2,
                       name='Barb Dylan',
                       parse=True,
                       identifiers=[AgentIdentifier(1)])
            ]),
            ([
                Person(0, name='Barb Dylan', identifiers=[AgentIdentifier(1)]),
                Person(1, name='Barb Dylan', identifiers=[AgentIdentifier(1)]),
                Person(2, name='Barb Dylan', identifiers=[AgentIdentifier(1)]),
                Person(3, name='Barb Dylan', identifiers=[AgentIdentifier(1)]),
            ], [
                Person(3,
                       name='Barb Dylan',
                       parse=True,
                       identifiers=[AgentIdentifier(1)])
            ]),
            # same name, different identifiers
            ([
                Person(name='Barb Dylan', identifiers=[AgentIdentifier(1)]),
                Person(name='Barb Dylan', identifiers=[AgentIdentifier(2)])
            ], [
                Person(name='Barb Dylan',
                       parse=True,
                       identifiers=[AgentIdentifier(1)]),
                Person(name='Barb Dylan',
                       parse=True,
                       identifiers=[AgentIdentifier(2)])
            ]),
            # no name - name, same identifier
            ([
                Person(name='', identifiers=[AgentIdentifier(1)]),
                Person(name='Barb Dylan', identifiers=[AgentIdentifier(1)])
            ], [
                Person(name='Barb Dylan',
                       parse=True,
                       identifiers=[AgentIdentifier(1)])
            ]),
            # two names, same identifier, take longer name
            ([
                Person(name='Barb Dylan', identifiers=[AgentIdentifier(1)]),
                Person(name='Barbra Dylan', identifiers=[AgentIdentifier(1)])
            ], [
                Person(name='Barbra Dylan',
                       parse=True,
                       identifiers=[AgentIdentifier(1)])
            ]),
            # two names, same length, same identifier, alphabetize and take first
            ([
                Person(name='Barb Dylan', identifiers=[AgentIdentifier(1)]),
                Person(name='Aarb Dylan', identifiers=[AgentIdentifier(1)])
            ], [
                Person(name='Aarb Dylan',
                       parse=True,
                       identifiers=[AgentIdentifier(1)])
            ]),
            # 3 different names, take longest of each name field
            (
                [
                    # Below case WILL FAIL. Haven't seen just a last name... yet
                    # Person(name='Dylan', identifiers=[AgentIdentifier(1)]),
                    Person(name='Dylan, B', identifiers=[AgentIdentifier(1)]),
                    Person(name='Barb Dylan', identifiers=[AgentIdentifier(1)
                                                           ]),
                    Person(name='B. D. Dylan',
                           identifiers=[AgentIdentifier(1)])
                ],
                [
                    Person(name='Barb D. Dylan',
                           parse=True,
                           identifiers=[AgentIdentifier(1)])
                ]),
        ])
    def test_normalize_person_relation(self, input, output, Graph):
        """Process several people at once and compare the serialized graph.

        ``input`` and ``output`` are sequences of nodes; each is unpacked
        into ``Graph`` as positional arguments. Processing runs with
        ``disambiguate=False``.
        """
        source_graph = Graph(*input)
        graph = ChangeGraph(source_graph)
        graph.process(disambiguate=False)

        # Evaluate the processed side first, then build the expected side.
        actual = graph.serialize()
        expected = Graph(*output)
        assert actual == expected

    @pytest.mark.parametrize('input, output', [
        (Agent(name='none'), None),
        (Agent(name=''), None),
        (Agent(name='NULL'), None),
        (Agent(name='None'), None),
        (Agent(name='           '), None),
        (Agent(name='     None      '), None),
        (Agent(name='     Empty Foundation      '),
         Organization(name='Empty Foundation')),
        (Agent(name='University \n of Arizona '),
         Institution(name='University of Arizona')),
        (Agent(name='NMRC, University College, Cork, Ireland'),
         Institution(name='NMRC, University College',
                     location='Cork, Ireland')),
        (Agent(name='Ioffe Physico-Technical Institute'),
         Institution(name='Ioffe Physico-Technical Institute')),
        (Agent(name='DPTA'), Organization(name='DPTA')),
        (Agent(
            name=
            'B. Verkin Institute for Low Temperatures Physics & Engineering, Kharkov, Ukraine'
        ),
         Institution(
             name=
             'B. Verkin Institute for Low Temperatures Physics & Engineering',
             location='Kharkov, Ukraine',
             type='institution')),
        (Agent(name='Physikalisches Institut, University Wuerzburg, Germany'),
         Agent(name='Physikalisches Institut',
               location='University Wuerzburg, Germany',
               type='institution')),
        (Agent(
            name=
            'Centro de Biotecnologia e Departamento de Biofísica; UFRGS; Av Bento Goncalves 9500, Predio 43431 sala 213 91501-970 Porto Alegre Rio Grande do Sul Brazi'
        ),
         Agent(
             name='UFRGS - Centro de Biotecnologia e Departamento de Biofísica',
             location=
             'Av Bento Goncalves 9500, Predio 43431 Sala 213 91501-970 Porto Alegre Rio Grande do Sul Brazi'
         )),
        (Agent(
            name=
            'Department of Chemistry; ZheJiang University; HangZhou ZheJiang CHINA'
        ),
         Institution(name='ZheJiang University - Department of Chemistry',
                     location='HangZhou ZheJiang CHINA')),
        (Agent(
            name=
            'Marine Evolution and Conservation; Groningen Institute for Evolutionary Life Sciences; University of Groningen; Nijenborgh 7, 9747 AG Groningen The Netherlands'
        ),
         Institution(
             name=
             'University of Groningen - Marine Evolution and Conservation; Groningen Institute for Evolutionary Life Sciences',
             location='Nijenborgh 7, 9747 AG Groningen The Netherlands')),
        (Agent(
            name=
            'Institute of Marine Research; PO Box 1870 Nordnes, 5817 Bergen Norway'
        ),
         Institution(name='Institute of Marine Research',
                     location='PO Box 1870 Nordnes, 5817 Bergen Norway')),
        (Agent(name='    PeerJ    Inc.    '), Organization(name='PeerJ Inc.')),
        (Agent(name=' Clinton   Foundation\n   '),
         Organization(name='Clinton Foundation')),
    ])
    def test_normalize_agent(self, input, output, Graph):
        """Process a graph holding one agent and compare its serialization.

        ``input`` is a single node passed straight to ``Graph``. When
        ``output`` is falsy (the parametrized cases use ``None``), the
        serialized graph is expected to equal an empty list; otherwise it is
        compared with ``Graph(output)``.
        """
        graph = ChangeGraph(Graph(input))
        graph.process(disambiguate=False)

        # Evaluate the processed side first, then build the expected side.
        actual = graph.serialize()
        if output:
            expected = Graph(output)
        else:
            expected = []
        assert actual == expected

    # test two organizations/institutions with the same name are merged
    # sort by length and then alphabetize name field
    @pytest.mark.parametrize(
        'input, output',
        [
            # same name, same identifiers
            ([
                Organization(name='American Heart Association',
                             identifiers=[AgentIdentifier(1)]),
                Organization(name='American Heart Association',
                             identifiers=[AgentIdentifier(1)])
            ], [
                Organization(name='American Heart Association',
                             identifiers=[AgentIdentifier(1)])
            ]),
            # same name, different identifiers
            ([
                Organization(name='Money Foundation',
                             identifiers=[AgentIdentifier(1)]),
                Organization(name='Money Foundation',
                             identifiers=[AgentIdentifier(2)])
            ], [
                Organization(
                    name='Money Foundation',
                    identifiers=[AgentIdentifier(1),
                                 AgentIdentifier(2)])
            ]),
            # same name, different identifiers, different capitalization
            ([
                Organization(name='Money Foundation',
                             identifiers=[AgentIdentifier(1)]),
                Organization(name='MONEY FOUNDATION',
                             identifiers=[AgentIdentifier(2)])
            ], [
                Organization(name='Money Foundation',
                             identifiers=[AgentIdentifier(1)]),
                Organization(name='MONEY FOUNDATION',
                             identifiers=[AgentIdentifier(2)])
            ]),
            # same identifier, different type, accept more specific type
            ([
                Institution(name='University of Virginia',
                            identifiers=[AgentIdentifier(1)]),
                Organization(name='University of Virginia',
                             identifiers=[AgentIdentifier(1)]),
            ], [
                Institution(name='University of Virginia',
                            identifiers=[AgentIdentifier(1)])
            ]),
            # same identifier, same name, same length, different capitalization, alphabetize
            ([
                Organization(name='Share', identifiers=[AgentIdentifier(1)]),
                Organization(name='SHARE', identifiers=[AgentIdentifier(1)])
            ], [Organization(name='SHARE', identifiers=[AgentIdentifier(1)])]),
            # same name, one identifier, add identifier
            ([
                Organization(name='Timetables Inc.'),
                Organization(name='Timetables Inc.',
                             identifiers=[AgentIdentifier(1)])
            ], [
                Organization(name='Timetables Inc.',
                             identifiers=[AgentIdentifier(1)])
            ]),
            # same identifier, different name, accept longest alphabetize
            ([
                Institution(name='Cooking Institute',
                            identifiers=[AgentIdentifier(1)]),
                Institution(name='Cooking Instituze',
                            identifiers=[AgentIdentifier(1)]),
                Institution(name='Cook Institute',
                            identifiers=[AgentIdentifier(1)])
            ], [
                Institution(name='Cooking Institute',
                            identifiers=[AgentIdentifier(1)])
            ]),
        ])
    def test_normalize_organization_institution_name(self, input, output,
                                                     Graph):
        """Process several organizations/institutions and compare the result.

        ``input`` and ``output`` are sequences of nodes; each is unpacked
        into ``Graph`` as positional arguments. Processing runs with
        ``disambiguate=False``.
        """
        source_graph = Graph(*input)
        graph = ChangeGraph(source_graph)
        graph.process(disambiguate=False)

        # Evaluate the processed side first, then build the expected side.
        actual = graph.serialize()
        expected = Graph(*output)
        assert actual == expected

    # test different types of agent work relations
    # Funder, Publisher, Host
    @pytest.mark.parametrize(
        'input, output',
        [
            # same name, same identifiers
            ([
                Funder(cited_as='American Heart Association',
                       agent=Organization(
                           1,
                           id=1,
                           name='American Heart Association',
                           identifiers=[AgentIdentifier(1, id=0)])),
                Host(cited_as='American Heart Association',
                     agent=Organization(2,
                                        id=1,
                                        name='American Heart Association',
                                        identifiers=[AgentIdentifier(1)]))
            ], [
                Funder(cited_as='American Heart Association',
                       agent=Organization(
                           1,
                           id=1,
                           name='American Heart Association',
                           identifiers=[AgentIdentifier(1, id=0)])),
                Host(cited_as='American Heart Association',
                     agent=Organization(1,
                                        id=1,
                                        name='American Heart Association',
                                        identifiers=[AgentIdentifier(1, id=0)
                                                     ]))
            ]),
            # same name, different identifiers
            ([
                Funder(cited_as='Money Foundation',
                       agent=Organization(id=1,
                                          name='Money Foundation',
                                          identifiers=[AgentIdentifier(1)])),
                Host(cited_as='Money Foundation',
                     agent=Organization(name='Money Foundation',
                                        identifiers=[AgentIdentifier(2)]))
            ], [
                Funder(cited_as='Money Foundation',
                       agent=Organization(id=1,
                                          name='Money Foundation',
                                          identifiers=[
                                              AgentIdentifier(1),
                                              AgentIdentifier(2)
                                          ])),
                Host(cited_as='Money Foundation',
                     agent=Organization(id=1, name='Money Foundation'))
            ]),
            # same identifier, different type
            ([
                Funder(cited_as='University of Virginia',
                       agent=Institution(id=1,
                                         name='University of Virginia',
                                         identifiers=[AgentIdentifier(1)])),
                Publisher(cited_as='University of Virginia',
                          agent=Institution(
                              name='University of Virginia',
                              identifiers=[AgentIdentifier(1, id=0)]))
            ], [
                Funder(cited_as='University of Virginia',
                       agent=Institution(
                           id=1,
                           name='University of Virginia',
                           identifiers=[AgentIdentifier(1, id=0)])),
                Publisher(cited_as='University of Virginia',
                          agent=Institution(id=1,
                                            name='University of Virginia'))
            ]),
            # same identifier, same name, same length, different capitalization, alphabetize
            ([
                Publisher(cited_as='Share',
                          agent=Organization(
                              id=0,
                              name='Share',
                              identifiers=[AgentIdentifier(1, id=2)])),
                Host(cited_as='SHARE',
                     agent=Organization(id=1,
                                        name='SHARE',
                                        identifiers=[AgentIdentifier(1, id=3)
                                                     ]))
            ], [
                Publisher(cited_as='Share',
                          agent=Organization(
                              id=1,
                              name='SHARE',
                              identifiers=[AgentIdentifier(1, id=3)])),
                Host(cited_as='SHARE', agent=Organization(id=1, name='SHARE'))
            ]),
            # same name, one identifier, add identifier
            ([
                Funder(cited_as='Timetables Inc.',
                       agent=Organization(id=1, name='Timetables Inc.')),
                Publisher(cited_as='Timetables Inc.',
                          agent=Organization(id=2,
                                             name='Timetables Inc.',
                                             identifiers=[AgentIdentifier(1)]))
            ], [
                Funder(cited_as='Timetables Inc.',
                       agent=Organization(id=2,
                                          name='Timetables Inc.',
                                          identifiers=[AgentIdentifier(1)])),
                Publisher(cited_as='Timetables Inc.', agent=Organization(id=2))
            ]),
            # same identifier, different name, accept longest alphabetize
            ([
                Funder(cited_as='Cooking Institute',
                       agent=Organization(id=1,
                                          name='Cooking Notaninstitute',
                                          identifiers=[AgentIdentifier(1)])),
                Publisher(cited_as='Cooking Instituze',
                          agent=Organization(id=2,
                                             name='Cooking Notaninstituze',
                                             identifiers=[AgentIdentifier(1)
                                                          ])),
                Host(cited_as='Cook Institute',
                     agent=Organization(id=3,
                                        name='Cook Notaninstitute',
                                        identifiers=[AgentIdentifier(1)]))
            ], [
                Funder(cited_as='Cooking Institute',
                       agent=Organization(id=3,
                                          name='Cooking Notaninstitute',
                                          identifiers=[AgentIdentifier(1)])),
                Publisher(cited_as='Cooking Instituze',
                          agent=Organization(id=3)),
                Host(cited_as='Cook Institute', agent=Organization(id=3))
            ]),
            # same identifier, different name, different type, accept longest alphabetize, more specific
            ([
                Funder(cited_as='Cooking Institute',
                       agent=Institution(id=1,
                                         name='Cooking Notaninstitute',
                                         identifiers=[AgentIdentifier(1)])),
                Publisher(cited_as='Cooking Instituze',
                          agent=Organization(id=2,
                                             name='Cooking Notaninstituze',
                                             identifiers=[AgentIdentifier(1)
                                                          ])),
                Host(cited_as='Cook Institute',
                     agent=Institution(id=3,
                                       name='Cook Notaninstitute',
                                       identifiers=[AgentIdentifier(1)]))
            ], [
                Funder(cited_as='Cooking Institute',
                       agent=Institution(id=3,
                                         name='Cooking Notaninstitute',
                                         identifiers=[AgentIdentifier(1)])),
                Publisher(cited_as='Cooking Instituze',
                          agent=Institution(id=3)),
                Host(cited_as='Cook Institute', agent=Institution(id=3))
            ]),
        ])
    def test_normalize_mixed_agent_relation(self, input, output, Graph):
        """Process a work's agent relations and compare the serialized graph.

        ``input`` is the list of agent relations attached to a single
        ``CreativeWork``; ``output`` is the list expected after processing
        with ``disambiguate=False``.
        """
        work = CreativeWork(agent_relations=input)
        graph = ChangeGraph(Graph(work))
        graph.process(disambiguate=False)

        # Evaluate the processed side first, then build the expected side.
        actual = graph.serialize()
        expected = Graph(CreativeWork(agent_relations=output))
        assert actual == expected

    # test different types of agent work relations
    # Contributor, Creator
    @pytest.mark.parametrize(
        'input, output',
        [
            # same name, same identifiers, different type, same type tree, organization
            ([
                Creator(cited_as='American Heart Association',
                        agent=Organization(
                            id=0,
                            name='American Heart Association',
                            identifiers=[AgentIdentifier(1, id=1)])),
                Contributor(cited_as='American Heart Association',
                            agent=Organization(
                                id=1,
                                name='American Heart Association',
                                identifiers=[AgentIdentifier(1, id=2)]))
            ], [
                Creator(cited_as='American Heart Association',
                        agent=Organization(
                            id=1,
                            name='American Heart Association',
                            identifiers=[AgentIdentifier(1, id=2)]))
            ]),
            # same name, different identifiers, different type, same type tree
            ([
                Creator(cited_as='Money Foundation',
                        agent=Organization(id=1,
                                           name='Money Foundation',
                                           identifiers=[AgentIdentifier()])),
                Contributor(cited_as='Money Foundation',
                            agent=Organization(id=2,
                                               name='Money Foundation',
                                               identifiers=[AgentIdentifier()
                                                            ])),
            ], [
                Creator(cited_as='Money Foundation',
                        agent=Organization(
                            id=2,
                            name='Money Foundation',
                            identifiers=[AgentIdentifier(),
                                         AgentIdentifier()]))
            ]),
            # same identifier, same name, different type
            ([
                Contributor(cited_as='University of Virginia',
                            agent=Institution(
                                id=0,
                                name='University of Virginia',
                                identifiers=[AgentIdentifier(1, id=1)])),
                Publisher(cited_as='University of Virginia',
                          agent=Institution(
                              id=1,
                              name='University of Virginia',
                              identifiers=[AgentIdentifier(1, id=1)]))
            ], [
                Contributor(cited_as='University of Virginia',
                            agent=Institution(
                                id=1,
                                name='University of Virginia',
                                identifiers=[AgentIdentifier(1, id=1)])),
                Publisher(cited_as='University of Virginia',
                          agent=Institution(
                              id=1,
                              name='University of Virginia',
                              identifiers=[AgentIdentifier(1, id=1)]))
            ]),
            # same identifier, same name, different type, same type tree, person
            ([
                Creator(cited_as='Bob Dylan',
                        agent=Person(id=0,
                                     name='Bob Dylan',
                                     identifiers=[AgentIdentifier(1, id=0)])),
                Contributor(cited_as='Bob Dylan',
                            agent=Person(
                                id=1,
                                name='Bob Dylan',
                                identifiers=[AgentIdentifier(1, id=1)])),
            ], [
                Creator(cited_as='Bob Dylan',
                        agent=Person(id=1,
                                     name='Bob Dylan',
                                     given_name='Bob',
                                     family_name='Dylan',
                                     identifiers=[AgentIdentifier(1, id=1)]))
            ]),
            # same identifier, different name, different type
            ([
                Creator(cited_as='B. Dylan',
                        agent=Person(id=0,
                                     name='B. Dylan',
                                     identifiers=[AgentIdentifier(1, id=0)])),
                Contributor(cited_as='Bob Dylan',
                            agent=Person(
                                id=1,
                                name='Bob Dylan',
                                identifiers=[AgentIdentifier(1, id=1)])),
            ], [
                Creator(cited_as='Bob Dylan',
                        agent=Person(id=1,
                                     name='Bob Dylan',
                                     given_name='Bob',
                                     family_name='Dylan',
                                     identifiers=[AgentIdentifier(1, id=1)]))
            ]),
            # same name, one identifier, add identifier
            ([
                Creator(1,
                        id=0,
                        order_cited=4,
                        cited_as='Timetables Inc.',
                        agent=Organization(id=0, name='Timetables Inc.')),
                Creator(1,
                        id=1,
                        order_cited=20,
                        cited_as='Timetables Inc.',
                        agent=Organization(id=1,
                                           name='Timetables Inc.',
                                           identifiers=[AgentIdentifier()]))
            ], [
                Creator(1,
                        id=1,
                        order_cited=20,
                        cited_as='Timetables Inc.',
                        agent=Organization(id=1,
                                           name='Timetables Inc.',
                                           identifiers=[AgentIdentifier()]))
            ]),
            # same identifier, different name, accept longest alphabetize
            ([
                Creator(cited_as='Cooking Institute',
                        agent=Organization(
                            id=1,
                            name='Cooking Institute',
                            identifiers=[AgentIdentifier(1, id=1)])),
                Contributor(cited_as='Cooking Instituze',
                            agent=Organization(
                                id=2,
                                name='Cooking Instituze',
                                identifiers=[AgentIdentifier(1, id=2)])),
                Funder(cited_as='Cook Institute',
                       agent=Organization(
                           id=3,
                           name='Cook Institute',
                           identifiers=[AgentIdentifier(1, id=3)]))
            ], [
                Creator(cited_as='Cooking Institute',
                        agent=Institution(
                            id=3,
                            name='Cooking Institute',
                            identifiers=[AgentIdentifier(1, id=3)])),
                Funder(cited_as='Cook Institute',
                       agent=Institution(
                           id=3,
                           name='Cooking Institute',
                           identifiers=[AgentIdentifier(1, id=3)]))
            ]),
            # same identifier, different name, different type, accept longest alphabetize, more specific
            ([
                Creator(cited_as='Cooking Institute',
                        order_cited=10,
                        agent=Institution(
                            id=0,
                            name='Cooking Institute',
                            identifiers=[AgentIdentifier(1, id=1)])),
                Contributor(cited_as='Cooking Instituze',
                            order_cited=3,
                            agent=Organization(
                                id=1,
                                name='Cooking Instituze',
                                identifiers=[AgentIdentifier(1, id=2)])),
                Funder(cited_as='Cook Institute',
                       agent=Institution(
                           id=2,
                           name='Cook Institute',
                           identifiers=[AgentIdentifier(1, id=3)]))
            ], [
                Creator(cited_as='Cooking Institute',
                        order_cited=10,
                        agent=Institution(
                            id=2,
                            name='Cooking Institute',
                            identifiers=[AgentIdentifier(1, id=3)])),
                Funder(cited_as='Cook Institute',
                       agent=Institution(
                           id=2,
                           name='Cooking Institute',
                           identifiers=[AgentIdentifier(1, id=3)]))
            ]),
            # Related agent removed
            ([
                Creator(cited_as='',
                        agent=Person(id=0,
                                     name='None',
                                     identifiers=[AgentIdentifier(1, id=1)])),
            ], [])
        ])
    def test_normalize_contributor_creator_relation(
            self, input, output, Graph):
        """Process a work's agent relations and compare the serialized graph.

        ``input`` is attached to a ``CreativeWork`` as ``agent_relations``,
        the resulting change graph is processed with disambiguation turned
        off, and its serialization must equal the graph built from a work
        carrying ``output`` as its ``agent_relations``.
        """
        source_work = CreativeWork(agent_relations=input)
        change_graph = ChangeGraph(Graph(source_work))
        change_graph.process(disambiguate=False)

        serialized = change_graph.serialize()
        expected = Graph(CreativeWork(agent_relations=output))
        assert serialized == expected

    # test work with related work
    @pytest.mark.parametrize(
        'input, output',
        [
            # different identifiers
            (CreativeWork(1, related_works=[CreativeWork(2)]),
             CreativeWork(1, related_works=[CreativeWork(2)])),
            # same identifiers
            (CreativeWork(1, id=1, related_works=[CreativeWork(1, id=1)
                                                  ]), CreativeWork(1, id=1)),
            # same and different identifiers
            (CreativeWork(
                1,
                id=1,
                related_works=[CreativeWork(2, id=2),
                               CreativeWork(1, id=1)]),
             CreativeWork(1, id=1, related_works=[CreativeWork(2, id=2)]))
        ])
    def test_normalize_related_work(self, input, output, Graph):
        """Process a work that has related works and compare the result.

        Both ``input`` and ``output`` are passed straight to ``Graph``; the
        processed (disambiguation disabled) change graph built from ``input``
        must serialize to the graph built from ``output``.
        """
        change_graph = ChangeGraph(Graph(input))
        change_graph.process(disambiguate=False)

        serialized = change_graph.serialize()
        expected = Graph(output)
        assert serialized == expected

    @pytest.mark.parametrize('input, output', [
        ({
            'title': '',
            'description': ''
        }, {
            'title': '',
            'description': ''
        }),
        ({
            'title': '    ',
            'description': '     '
        }, {
            'title': '',
            'description': ''
        }),
        ({
            'title': 'Title\nLine'
        }, {
            'title': 'Title Line'
        }),
        ({
            'description': 'Line\nAfter\nLine\nAfter\nLine'
        }, {
            'description': 'Line After Line After Line'
        }),
        ({
            'description': 'null'
        }, {
            'description': ''
        }),
    ])
    def test_normalize_creativework(self, input, output, Graph):
        """Process a work built from keyword fields and compare the result.

        ``input`` and ``output`` are dicts that are expanded into
        ``CreativeWork`` keyword arguments.
        """
        source_work = CreativeWork(**input)
        change_graph = ChangeGraph(Graph(source_work))
        change_graph.process(disambiguate=False)

        serialized = change_graph.serialize()
        expected = Graph(CreativeWork(**output))
        assert serialized == expected

    @pytest.mark.parametrize('input, output', [(
        input,
        Creator(cited_as='James Bond',
                agent=Person(name='James Bond',
                             family_name='Bond',
                             given_name='James')),
    ) for input in [
        Creator(cited_as='   \t James\n Bond \t     ',
                agent=Person(name='James  Bond')),
        Creator(cited_as='', agent=Person(name='James  Bond')),
        Creator(cited_as='', agent=Person(name='James      Bond')),
        Creator(cited_as='',
                agent=Person(given_name='James', family_name='Bond')),
    ]] + [(
        input,
        Contributor(cited_as='James Bond',
                    agent=Person(name='James Bond',
                                 family_name='Bond',
                                 given_name='James')),
    ) for input in [
        Contributor(cited_as='   \t James\n Bond \t     ',
                    agent=Person(name='James  Bond')),
        Contributor(cited_as='', agent=Person(name='James  Bond')),
    ]])
    def test_normalize_agentworkrelation(self, input, output, Graph):
        """Process a single agent-work relation and compare the result.

        The change graph built from ``input`` is processed with
        disambiguation disabled; its serialization must equal
        ``Graph(output)``.
        """
        change_graph = ChangeGraph(Graph(input))
        change_graph.process(disambiguate=False)

        serialized = change_graph.serialize()
        expected = Graph(output)
        assert serialized == expected
Exemple #3
0
 def test_normalize_creativework(self, input, output, Graph):
     """Process a work built from keyword fields and compare the result.

     ``input`` and ``output`` are expanded into ``CreativeWork`` keyword
     arguments.
     """
     source_work = CreativeWork(**input)
     change_graph = ChangeGraph(Graph(source_work))
     change_graph.process(disambiguate=False)

     serialized = change_graph.serialize()
     expected = Graph(CreativeWork(**output))
     assert serialized == expected
Exemple #4
0
 def test_normalize_contributor_creator_relation(
         self, input, output, Graph):
     """Process a work's agent relations and compare the serialized graph.

     ``input`` and ``output`` are each attached to a ``CreativeWork`` as
     its ``agent_relations``.
     """
     source_work = CreativeWork(agent_relations=input)
     change_graph = ChangeGraph(Graph(source_work))
     change_graph.process(disambiguate=False)

     serialized = change_graph.serialize()
     expected = Graph(CreativeWork(agent_relations=output))
     assert serialized == expected
Exemple #5
0
    def test_normalize_tag(self, input, output, Graph):
        """Process a work carrying one tag and compare the serialized graph.

        ``input`` is a single tag, wrapped in a one-element list before it is
        attached to the work; ``output`` is used as the expected work's
        ``tags`` value unchanged.
        """
        source_work = CreativeWork(tags=[input])
        change_graph = ChangeGraph(Graph(source_work))
        change_graph.process(disambiguate=False)

        serialized = change_graph.serialize()
        expected = Graph(CreativeWork(tags=output))
        assert serialized == expected
Exemple #6
0
 def test_normalize_tags_on_work(self, input, output, Graph, ExpectedGraph):
     """Regulate a work's tags and compare against the expected graph.

     ``regulate`` is called for its effect on the graph: its return value
     is ignored and the same graph object is compared afterwards.
     """
     work_graph = Graph(CreativeWork(tags=input))
     regulator = Regulator(validate=False)
     regulator.regulate(work_graph)

     expected = ExpectedGraph(CreativeWork(tags=output))
     assert work_graph == expected
Exemple #7
0
 def test_normalize_contributor_creator_relation(
         self, input, output, Graph, ExpectedGraph):
     """Regulate a work's agent relations and compare the resulting graph.

     ``regulate`` is called for its effect on the graph: its return value
     is ignored and the same graph object is compared afterwards.
     """
     work_graph = Graph(CreativeWork(agent_relations=input))
     regulator = Regulator(validate=False)
     regulator.regulate(work_graph)

     expected = ExpectedGraph(CreativeWork(agent_relations=output))
     assert work_graph == expected
Exemple #8
0
class TestModelNormalization:

    # test each tag resolves to lowercased, tokenized name
    @pytest.mark.parametrize('input, output', [(i, o) for input, o in [
        ([
            Tag(name=''),
            Tag(name='        '),
            Tag(name='\n\n\n'),
        ], []),
        ([
            Tag(name='foo'),
            Tag(name='foO'),
            Tag(name='Foo'),
            Tag(name='FOO'),
            Tag(name='      FOO'),
            Tag(name='      foo\n\n\n'),
        ], [Tag(name='foo')]),
        ([
            Tag(name='Rocket League'),
            Tag(name='rocket league'),
            Tag(name='ROCKET LEAGUE'),
            Tag(name='Rocket         League'),
            Tag(name='\nRocket    \n     League\t'),
            Tag(name='rocket\nleague'),
        ], [Tag(name='rocket league')]),
        ([
            Tag(name='Crash; Bandicoot'),
            Tag(name='Crash;           Bandicoot'),
            Tag(name='\nCrash; Bandicoot'),
            Tag(name='crash, bandicoot'),
            Tag(name='Crash ,Bandicoot           '),
        ], [Tag(name='bandicoot'), Tag(name='crash')]),
    ] for i in input])
    def test_normalize_tag(self, input, output, Graph):
        """Process a work carrying one tag and compare the serialized graph.

        ``input`` is a single tag, wrapped in a one-element list before it is
        attached to the work; ``output`` is used as the expected work's
        ``tags`` value unchanged.
        """
        source_work = CreativeWork(tags=[input])
        change_graph = ChangeGraph(Graph(source_work))
        change_graph.process(disambiguate=False)

        serialized = change_graph.serialize()
        expected = Graph(CreativeWork(tags=output))
        assert serialized == expected

    # test tags with the same name are merged on a work
    @pytest.mark.parametrize('input, output', [
        ([
            Tag(name=''),
            Tag(name='        '),
            Tag(name='\n\n\n'),
        ], []),
        ([
            Tag(name='foo'),
            Tag(name='foO'),
            Tag(name='Foo'),
            Tag(name='FOO'),
            Tag(name='      FOO'),
            Tag(name='      foo\n\n\n'),
        ], [Tag(name='foo')]),
        ([
            Tag(name='Rocket League'),
            Tag(name='rocket league'),
            Tag(name='ROCKET LEAGUE'),
            Tag(name='Rocket         League'),
            Tag(name='\nRocket    \n     League\t'),
            Tag(name='rocket\nleague'),
        ], [Tag(name='rocket league')]),
        ([
            Tag(name='Crash; Bandicoot'),
            Tag(name='Crash;           Bandicoot'),
            Tag(name='\nCrash; Bandicoot'),
            Tag(name='crash, bandicoot'),
            Tag(name='Crash ,Bandicoot           '),
        ], [Tag(name='bandicoot'), Tag(name='crash')]),
    ])
    def test_normalize_tags_on_work(self, input, output, Graph):
        """Normalize and prune a work's tags, then compare the sorted nodes.

        Unlike the tests that call ``process``, this one calls ``normalize``
        and ``prune`` directly and serializes each node itself, ordering the
        nodes by their ``type`` concatenated with the string form of their
        ``id``.
        """
        def node_order(node):
            return node.type + str(node.id)

        change_graph = ChangeGraph(Graph(CreativeWork(tags=input)))
        change_graph.normalize()
        change_graph.prune()

        ordered_nodes = sorted(change_graph.nodes, key=node_order)
        serialized = [node.serialize() for node in ordered_nodes]
        expected = Graph(CreativeWork(tags=output))
        assert serialized == expected

    @pytest.mark.parametrize(
        'input, output',
        [(i, o)
         for input, o in [([
             Person(name='Smith, J'),
             Person(name='J    Smith   '),
             Person(name='Smith,     J'),
             Person(given_name='J', family_name='Smith'),
             Person(given_name='  J', family_name='\n\nSmith'),
         ], Person(name='J Smith', family_name='Smith', given_name='J')),
                          ([
                              Person(name='Johnathan James Doe'),
                              Person(name='johnathan james doe'),
                          ],
                           Person(name='Johnathan James Doe',
                                  family_name='Doe',
                                  given_name='Johnathan',
                                  additional_name='James')),
                          ([
                              Person(name='johnathan james doe JR'),
                          ],
                           Person(name='Johnathan James Doe Jr',
                                  family_name='Doe',
                                  given_name='Johnathan',
                                  additional_name='James',
                                  suffix='Jr')),
                          ([
                              Person(name='none'),
                              Person(name=''),
                              Person(name='NULL'),
                              Person(name='None'),
                              Person(name='           '),
                              Person(name='     None      '),
                          ], None)] for i in input])
    def test_normalize_person(self, input, output, Graph):
        """Process a single person and compare the serialized graph.

        A falsy ``output`` (the parameter table uses ``None``) means the
        serialized graph is expected to be an empty list.
        """
        change_graph = ChangeGraph(Graph(input))
        change_graph.process(disambiguate=False)

        serialized = change_graph.serialize()
        if output:
            expected = Graph(output)
        else:
            expected = []
        assert serialized == expected

    # test two people with the same identifier are merged
    # sort by length and then alphabetize name field
    @pytest.mark.parametrize(
        'input, output',
        [
            # same name, same identifier
            ([
                Person(0, name='Barb Dylan', identifiers=[AgentIdentifier(1)]),
                Person(1, name='Barb Dylan', identifiers=[AgentIdentifier(1)]),
                Person(2, name='Barb Dylan', identifiers=[AgentIdentifier(1)]),
            ], [
                Person(2,
                       name='Barb Dylan',
                       parse=True,
                       identifiers=[AgentIdentifier(1, parse=True)])
            ]),
            ([
                Person(0, name='Barb Dylan', identifiers=[AgentIdentifier(1)]),
                Person(1, name='Barb Dylan', identifiers=[AgentIdentifier(1)]),
                Person(2, name='Barb Dylan', identifiers=[AgentIdentifier(1)]),
                Person(3, name='Barb Dylan', identifiers=[AgentIdentifier(1)]),
            ], [
                Person(3,
                       name='Barb Dylan',
                       parse=True,
                       identifiers=[AgentIdentifier(1, parse=True)])
            ]),
            # same name, different identifiers
            ([
                Person(name='Barb Dylan', identifiers=[AgentIdentifier(1)]),
                Person(name='Barb Dylan', identifiers=[AgentIdentifier(2)])
            ], [
                Person(name='Barb Dylan',
                       parse=True,
                       identifiers=[AgentIdentifier(1, parse=True)]),
                Person(name='Barb Dylan',
                       parse=True,
                       identifiers=[AgentIdentifier(2, parse=True)])
            ]),
            # no name - name, same identifier
            ([
                Person(name='', identifiers=[AgentIdentifier(1)]),
                Person(name='Barb Dylan', identifiers=[AgentIdentifier(1)])
            ], [
                Person(name='Barb Dylan',
                       parse=True,
                       identifiers=[AgentIdentifier(1, parse=True)])
            ]),
            # two names, same identifier, take longer name
            ([
                Person(name='Barb Dylan', identifiers=[AgentIdentifier(1)]),
                Person(name='Barbra Dylan', identifiers=[AgentIdentifier(1)])
            ], [
                Person(name='Barbra Dylan',
                       parse=True,
                       identifiers=[AgentIdentifier(1, parse=True)])
            ]),
            # two names, same length, same identifier, alphabetize and take first
            ([
                Person(name='Barb Dylan', identifiers=[AgentIdentifier(1)]),
                Person(name='Aarb Dylan', identifiers=[AgentIdentifier(1)])
            ], [
                Person(name='Aarb Dylan',
                       parse=True,
                       identifiers=[AgentIdentifier(1, parse=True)])
            ]),
            # 3 different names, take longest of each name field
            (
                [
                    # Below case WILL FAIL. Haven't seen just a last name... yet
                    # Person(name='Dylan', identifiers=[AgentIdentifier(1)]),
                    Person(name='Dylan, B', identifiers=[AgentIdentifier(1)]),
                    Person(name='Barb Dylan', identifiers=[AgentIdentifier(1)
                                                           ]),
                    Person(name='B. D. Dylan',
                           identifiers=[AgentIdentifier(1)])
                ],
                [
                    Person(name='Barb D. Dylan',
                           parse=True,
                           identifiers=[AgentIdentifier(1, parse=True)])
                ]),
        ])
    def test_normalize_person_relation(self, input, output, Graph):
        """Process several people together and compare the serialized graph.

        ``input`` and ``output`` are lists whose items are unpacked as
        separate positional arguments to ``Graph``.
        """
        change_graph = ChangeGraph(Graph(*input))
        change_graph.process(disambiguate=False)

        serialized = change_graph.serialize()
        expected = Graph(*output)
        assert serialized == expected

    @pytest.mark.parametrize('input, output', [
        (Agent(name='none'), None),
        (Agent(name=''), None),
        (Agent(name='NULL'), None),
        (Agent(name='None'), None),
        (Agent(name='           '), None),
        (Agent(name='     None      '), None),
        (Agent(name='     Empty Foundation      '),
         Organization(name='Empty Foundation')),
        (Agent(name='University \n of Arizona '),
         Institution(name='University of Arizona')),
        (Agent(name='NMRC, University College, Cork, Ireland'),
         Institution(name='NMRC, University College',
                     location='Cork, Ireland')),
        (Agent(name='Ioffe Physico-Technical Institute'),
         Institution(name='Ioffe Physico-Technical Institute')),
        (Agent(name='DPTA'), Organization(name='DPTA')),
        (Agent(
            name=
            'B. Verkin Institute for Low Temperatures Physics & Engineering, Kharkov, Ukraine'
        ),
         Institution(
             name=
             'B. Verkin Institute for Low Temperatures Physics & Engineering',
             location='Kharkov, Ukraine',
             type='institution')),
        (Agent(name='Physikalisches Institut, University Wuerzburg, Germany'),
         Agent(name='Physikalisches Institut',
               location='University Wuerzburg, Germany',
               type='institution')),
        (Agent(
            name=
            'Centro de Biotecnologia e Departamento de Biofísica; UFRGS; Av Bento Goncalves 9500, Predio 43431 sala 213 91501-970 Porto Alegre Rio Grande do Sul Brazi'
        ),
         Agent(
             name='UFRGS - Centro de Biotecnologia e Departamento de Biofísica',
             location=
             'Av Bento Goncalves 9500, Predio 43431 Sala 213 91501-970 Porto Alegre Rio Grande do Sul Brazi'
         )),
        (Agent(
            name=
            'Department of Chemistry; ZheJiang University; HangZhou ZheJiang CHINA'
        ),
         Institution(name='ZheJiang University - Department of Chemistry',
                     location='HangZhou ZheJiang CHINA')),
        (Agent(
            name=
            'Marine Evolution and Conservation; Groningen Institute for Evolutionary Life Sciences; University of Groningen; Nijenborgh 7, 9747 AG Groningen The Netherlands'
        ),
         Institution(
             name=
             'University of Groningen - Marine Evolution and Conservation; Groningen Institute for Evolutionary Life Sciences',
             location='Nijenborgh 7, 9747 AG Groningen The Netherlands')),
        (Agent(
            name=
            'Institute of Marine Research; PO Box 1870 Nordnes, 5817 Bergen Norway'
        ),
         Institution(name='Institute of Marine Research',
                     location='PO Box 1870 Nordnes, 5817 Bergen Norway')),
        (Agent(name='    PeerJ    Inc.    '), Organization(name='PeerJ Inc.')),
        (Agent(name=' Clinton   Foundation\n   '),
         Organization(name='Clinton Foundation')),
    ])
    def test_normalize_agent(self, input, output, Graph):
        """Process a single agent and compare the serialized graph.

        A falsy ``output`` (the parameter table uses ``None``) means the
        serialized graph is expected to be an empty list.
        """
        change_graph = ChangeGraph(Graph(input))
        change_graph.process(disambiguate=False)

        serialized = change_graph.serialize()
        if output:
            expected = Graph(output)
        else:
            expected = []
        assert serialized == expected

    # test two organizations/institutions with the same name are merged
    # sort by length and then alphabetize name field
    @pytest.mark.parametrize(
        'input, output',
        [
            # same name, same identifiers
            ([
                Organization(name='American Heart Association',
                             identifiers=[AgentIdentifier(1)]),
                Organization(name='American Heart Association',
                             identifiers=[AgentIdentifier(1)])
            ], [
                Organization(name='American Heart Association',
                             identifiers=[AgentIdentifier(1, parse=True)])
            ]),
            # same name, different identifiers
            ([
                Organization(name='Money Foundation',
                             identifiers=[AgentIdentifier(1)]),
                Organization(name='Money Foundation',
                             identifiers=[AgentIdentifier(2)])
            ], [
                Organization(name='Money Foundation',
                             identifiers=[
                                 AgentIdentifier(1, parse=True),
                                 AgentIdentifier(2, parse=True)
                             ])
            ]),
            # same name, different identifiers, different capitalization
            ([
                Organization(name='Money Foundation',
                             identifiers=[AgentIdentifier(1)]),
                Organization(name='MONEY FOUNDATION',
                             identifiers=[AgentIdentifier(2)])
            ], [
                Organization(name='Money Foundation',
                             identifiers=[AgentIdentifier(1, parse=True)]),
                Organization(name='MONEY FOUNDATION',
                             identifiers=[AgentIdentifier(2, parse=True)])
            ]),
            # same identifier, different type, accept more specific type
            ([
                Institution(name='University of Virginia',
                            identifiers=[AgentIdentifier(1)]),
                Organization(name='University of Virginia',
                             identifiers=[AgentIdentifier(1)]),
            ], [
                Institution(name='University of Virginia',
                            identifiers=[AgentIdentifier(1, parse=True)])
            ]),
            # same identifier, same name, same length, different capitalization, alphabetize
            ([
                Organization(name='Share', identifiers=[AgentIdentifier(1)]),
                Organization(name='SHARE', identifiers=[AgentIdentifier(1)])
            ], [
                Organization(name='SHARE',
                             identifiers=[AgentIdentifier(1, parse=True)])
            ]),
            # same name, one identifier, add identifier
            ([
                Organization(name='Timetables Inc.'),
                Organization(name='Timetables Inc.',
                             identifiers=[AgentIdentifier(1)])
            ], [
                Organization(name='Timetables Inc.',
                             identifiers=[AgentIdentifier(1, parse=True)])
            ]),
            # same identifier, different name, accept longest alphabetize
            ([
                Institution(name='Cooking Institute',
                            identifiers=[AgentIdentifier(1)]),
                Institution(name='Cooking Instituze',
                            identifiers=[AgentIdentifier(1)]),
                Institution(name='Cook Institute',
                            identifiers=[AgentIdentifier(1)])
            ], [
                Institution(name='Cooking Institute',
                            identifiers=[AgentIdentifier(1, parse=True)])
            ]),
        ])
    def test_normalize_organization_institution_name(
            self, input, output, Graph):
        """Process several agents together and compare the serialized graph.

        ``input`` and ``output`` are lists whose items are unpacked as
        separate positional arguments to ``Graph``.
        """
        change_graph = ChangeGraph(Graph(*input))
        change_graph.process(disambiguate=False)

        serialized = change_graph.serialize()
        expected = Graph(*output)
        assert serialized == expected

    # test different types of agent work relations
    # Funder, Publisher, Host
    @pytest.mark.parametrize(
        'input, output',
        [
            # same name, same identifiers
            ([
                Funder(cited_as='American Heart Association',
                       agent=Organization(
                           1,
                           id=1,
                           name='American Heart Association',
                           identifiers=[AgentIdentifier(1, id=0)])),
                Host(cited_as='American Heart Association',
                     agent=Organization(2,
                                        id=1,
                                        name='American Heart Association',
                                        identifiers=[AgentIdentifier(1)]))
            ], [
                Funder(cited_as='American Heart Association',
                       agent=Organization(
                           1,
                           id=1,
                           name='American Heart Association',
                           identifiers=[AgentIdentifier(1, id=0, parse=True)
                                        ])),
                Host(cited_as='American Heart Association',
                     agent=Organization(
                         1,
                         id=1,
                         name='American Heart Association',
                         identifiers=[AgentIdentifier(1, id=0, parse=True)]))
            ]),
            # same name, different identifiers
            ([
                Funder(cited_as='Money Foundation',
                       agent=Organization(id=1,
                                          name='Money Foundation',
                                          identifiers=[AgentIdentifier(1)])),
                Host(cited_as='Money Foundation',
                     agent=Organization(name='Money Foundation',
                                        identifiers=[AgentIdentifier(2)]))
            ], [
                Funder(cited_as='Money Foundation',
                       agent=Organization(id=1,
                                          name='Money Foundation',
                                          identifiers=[
                                              AgentIdentifier(1, parse=True),
                                              AgentIdentifier(2, parse=True)
                                          ])),
                Host(cited_as='Money Foundation',
                     agent=Organization(id=1, name='Money Foundation'))
            ]),
            # same identifier, different type
            ([
                Funder(cited_as='University of Virginia',
                       agent=Institution(id=1,
                                         name='University of Virginia',
                                         identifiers=[AgentIdentifier(1)])),
                Publisher(cited_as='University of Virginia',
                          agent=Institution(
                              name='University of Virginia',
                              identifiers=[AgentIdentifier(1, id=0)]))
            ], [
                Funder(cited_as='University of Virginia',
                       agent=Institution(
                           id=1,
                           name='University of Virginia',
                           identifiers=[AgentIdentifier(1, id=0, parse=True)
                                        ])),
                Publisher(cited_as='University of Virginia',
                          agent=Institution(id=1,
                                            name='University of Virginia'))
            ]),
            # same identifier, same name, same length, different capitalization, alphabetize
            ([
                Publisher(cited_as='Share',
                          agent=Organization(
                              id=0,
                              name='Share',
                              identifiers=[AgentIdentifier(1, id=2)])),
                Host(cited_as='SHARE',
                     agent=Organization(id=1,
                                        name='SHARE',
                                        identifiers=[AgentIdentifier(1, id=3)
                                                     ]))
            ], [
                Publisher(
                    cited_as='Share',
                    agent=Organization(
                        id=1,
                        name='SHARE',
                        identifiers=[AgentIdentifier(1, id=3, parse=True)])),
                Host(cited_as='SHARE', agent=Organization(id=1, name='SHARE'))
            ]),
            # same name, one identifier, add identifier
            ([
                Funder(cited_as='Timetables Inc.',
                       agent=Organization(id=1, name='Timetables Inc.')),
                Publisher(cited_as='Timetables Inc.',
                          agent=Organization(id=2,
                                             name='Timetables Inc.',
                                             identifiers=[AgentIdentifier(1)]))
            ], [
                Funder(cited_as='Timetables Inc.',
                       agent=Organization(
                           id=2,
                           name='Timetables Inc.',
                           identifiers=[AgentIdentifier(1, parse=True)])),
                Publisher(cited_as='Timetables Inc.', agent=Organization(id=2))
            ]),
            # same identifier, different name, accept longest alphabetize
            ([
                Funder(cited_as='Cooking Institute',
                       agent=Organization(id=1,
                                          name='Cooking Notaninstitute',
                                          identifiers=[AgentIdentifier(1)])),
                Publisher(cited_as='Cooking Instituze',
                          agent=Organization(id=2,
                                             name='Cooking Notaninstituze',
                                             identifiers=[AgentIdentifier(1)
                                                          ])),
                Host(cited_as='Cook Institute',
                     agent=Organization(id=3,
                                        name='Cook Notaninstitute',
                                        identifiers=[AgentIdentifier(1)]))
            ], [
                Funder(cited_as='Cooking Institute',
                       agent=Organization(
                           id=3,
                           name='Cooking Notaninstitute',
                           identifiers=[AgentIdentifier(1, parse=True)])),
                Publisher(cited_as='Cooking Instituze',
                          agent=Organization(id=3)),
                Host(cited_as='Cook Institute', agent=Organization(id=3))
            ]),
            # same identifier, different name, different type, accept longest alphabetize, more specific
            ([
                Funder(cited_as='Cooking Institute',
                       agent=Institution(id=1,
                                         name='Cooking Notaninstitute',
                                         identifiers=[AgentIdentifier(1)])),
                Publisher(cited_as='Cooking Instituze',
                          agent=Organization(id=2,
                                             name='Cooking Notaninstituze',
                                             identifiers=[AgentIdentifier(1)
                                                          ])),
                Host(cited_as='Cook Institute',
                     agent=Institution(id=3,
                                       name='Cook Notaninstitute',
                                       identifiers=[AgentIdentifier(1)]))
            ], [
                Funder(cited_as='Cooking Institute',
                       agent=Institution(
                           id=3,
                           name='Cooking Notaninstitute',
                           identifiers=[AgentIdentifier(1, parse=True)])),
                Publisher(cited_as='Cooking Instituze',
                          agent=Institution(id=3)),
                Host(cited_as='Cook Institute', agent=Institution(id=3))
            ]),
        ])
    def test_normalize_mixed_agent_relation(self, input, output, Graph):
        """Process a work's mixed agent relations and compare the result.

        ``input`` and ``output`` are each attached to a ``CreativeWork`` as
        its ``agent_relations``; processing runs with disambiguation
        disabled.
        """
        source_work = CreativeWork(agent_relations=input)
        change_graph = ChangeGraph(Graph(source_work))
        change_graph.process(disambiguate=False)

        serialized = change_graph.serialize()
        expected = Graph(CreativeWork(agent_relations=output))
        assert serialized == expected

    # test different types of agent work relations
    # Contributor, Creator
    @pytest.mark.parametrize(
        'input, output',
        [
            # same name, same identifiers, different type, same type tree, organization
            ([
                Creator(cited_as='American Heart Association',
                        agent=Organization(
                            id=0,
                            name='American Heart Association',
                            identifiers=[AgentIdentifier(1, id=1)])),
                Contributor(cited_as='American Heart Association',
                            agent=Organization(
                                id=1,
                                name='American Heart Association',
                                identifiers=[AgentIdentifier(1, id=2)]))
            ], [
                Creator(cited_as='American Heart Association',
                        agent=Organization(
                            id=1,
                            name='American Heart Association',
                            identifiers=[AgentIdentifier(1, id=2, parse=True)
                                         ]))
            ]),
            # same name, different identifiers, different type, same type tree
            ([
                Creator(cited_as='Money Foundation',
                        agent=Organization(id=1,
                                           name='Money Foundation',
                                           identifiers=[AgentIdentifier()])),
                Contributor(cited_as='Money Foundation',
                            agent=Organization(id=2,
                                               name='Money Foundation',
                                               identifiers=[AgentIdentifier()
                                                            ])),
            ], [
                Creator(cited_as='Money Foundation',
                        agent=Organization(id=2,
                                           name='Money Foundation',
                                           identifiers=[
                                               AgentIdentifier(parse=True),
                                               AgentIdentifier(parse=True)
                                           ]))
            ]),
            # same identifier, same name, different type
            ([
                Contributor(cited_as='University of Virginia',
                            agent=Institution(
                                id=0,
                                name='University of Virginia',
                                identifiers=[AgentIdentifier(1, id=1)])),
                Publisher(cited_as='University of Virginia',
                          agent=Institution(
                              id=1,
                              name='University of Virginia',
                              identifiers=[AgentIdentifier(1, id=1)]))
            ], [
                Contributor(
                    cited_as='University of Virginia',
                    agent=Institution(
                        id=1,
                        name='University of Virginia',
                        identifiers=[AgentIdentifier(1, id=1, parse=True)])),
                Publisher(
                    cited_as='University of Virginia',
                    agent=Institution(
                        id=1,
                        name='University of Virginia',
                        identifiers=[AgentIdentifier(1, id=1, parse=True)]))
            ]),
            # same identifier, same name, different type, same type tree, person
            ([
                Creator(cited_as='Bob Dylan',
                        agent=Person(id=0,
                                     name='Bob Dylan',
                                     identifiers=[AgentIdentifier(1, id=0)])),
                Contributor(cited_as='Bob Dylan',
                            agent=Person(
                                id=1,
                                name='Bob Dylan',
                                identifiers=[AgentIdentifier(1, id=1)])),
            ], [
                Creator(cited_as='Bob Dylan',
                        agent=Person(id=1,
                                     name='Bob Dylan',
                                     given_name='Bob',
                                     family_name='Dylan',
                                     identifiers=[
                                         AgentIdentifier(1, id=1, parse=True)
                                     ]))
            ]),
            # same identifier, different name, different type
            ([
                Creator(cited_as='B. Dylan',
                        agent=Person(id=0,
                                     name='B. Dylan',
                                     identifiers=[AgentIdentifier(1, id=0)])),
                Contributor(cited_as='Bob Dylan',
                            agent=Person(
                                id=1,
                                name='Bob Dylan',
                                identifiers=[AgentIdentifier(1, id=1)])),
            ], [
                Creator(cited_as='Bob Dylan',
                        agent=Person(id=1,
                                     name='Bob Dylan',
                                     given_name='Bob',
                                     family_name='Dylan',
                                     identifiers=[
                                         AgentIdentifier(1, id=1, parse=True)
                                     ]))
            ]),
            # same name, one identifier, add identifier
            ([
                Creator(1,
                        id=0,
                        order_cited=4,
                        cited_as='Timetables Inc.',
                        agent=Organization(id=0, name='Timetables Inc.')),
                Creator(1,
                        id=1,
                        order_cited=20,
                        cited_as='Timetables Inc.',
                        agent=Organization(id=1,
                                           name='Timetables Inc.',
                                           identifiers=[AgentIdentifier()]))
            ], [
                Creator(1,
                        id=1,
                        order_cited=20,
                        cited_as='Timetables Inc.',
                        agent=Organization(
                            id=1,
                            name='Timetables Inc.',
                            identifiers=[AgentIdentifier(parse=True)]))
            ]),
            # same identifier, different name, accept longest alphabetize
            ([
                Creator(cited_as='Cooking Institute',
                        agent=Organization(
                            id=1,
                            name='Cooking Institute',
                            identifiers=[AgentIdentifier(1, id=1)])),
                Contributor(cited_as='Cooking Instituze',
                            agent=Organization(
                                id=2,
                                name='Cooking Instituze',
                                identifiers=[AgentIdentifier(1, id=2)])),
                Funder(cited_as='Cook Institute',
                       agent=Organization(
                           id=3,
                           name='Cook Institute',
                           identifiers=[AgentIdentifier(1, id=3)]))
            ], [
                Creator(cited_as='Cooking Institute',
                        agent=Institution(
                            id=3,
                            name='Cooking Institute',
                            identifiers=[AgentIdentifier(1, id=3, parse=True)
                                         ])),
                Funder(cited_as='Cook Institute',
                       agent=Institution(
                           id=3,
                           name='Cooking Institute',
                           identifiers=[AgentIdentifier(1, id=3, parse=True)]))
            ]),
            # same identifier, different name, different type, accept longest alphabetize, more specific
            ([
                Creator(cited_as='Cooking Institute',
                        order_cited=10,
                        agent=Institution(
                            id=0,
                            name='Cooking Institute',
                            identifiers=[AgentIdentifier(1, id=1)])),
                Contributor(cited_as='Cooking Instituze',
                            order_cited=3,
                            agent=Organization(
                                id=1,
                                name='Cooking Instituze',
                                identifiers=[AgentIdentifier(1, id=2)])),
                Funder(cited_as='Cook Institute',
                       agent=Institution(
                           id=2,
                           name='Cook Institute',
                           identifiers=[AgentIdentifier(1, id=3)]))
            ], [
                Creator(cited_as='Cooking Institute',
                        order_cited=10,
                        agent=Institution(
                            id=2,
                            name='Cooking Institute',
                            identifiers=[AgentIdentifier(1, id=3, parse=True)
                                         ])),
                Funder(cited_as='Cook Institute',
                       agent=Institution(
                           id=2,
                           name='Cooking Institute',
                           identifiers=[AgentIdentifier(1, id=3, parse=True)]))
            ]),
            # Related agent removed
            ([
                Creator(cited_as='',
                        agent=Person(id=0,
                                     name='None',
                                     identifiers=[AgentIdentifier(1, id=1)])),
            ], [])
        ])
    def test_normalize_contributor_creator_relation(
            self, input, output, Graph):
        """Process Creator/Contributor style relations on a single work.

        A CreativeWork carrying ``input`` as ``agent_relations`` is turned
        into a ChangeGraph and processed without disambiguation; the
        serialized graph must equal the one built from ``output``.
        """
        source_work = CreativeWork(agent_relations=input)
        change_graph = ChangeGraph(Graph(source_work))
        change_graph.process(disambiguate=False)

        # Serialize before building the expected graph (original call order).
        actual = change_graph.serialize()
        expected = Graph(CreativeWork(agent_relations=output))
        assert actual == expected

    # test work with related work
    @pytest.mark.parametrize(
        'input, output',
        [
            # different identifiers
            (CreativeWork(1, related_works=[CreativeWork(2)]),
             CreativeWork(1, related_works=[CreativeWork(2)])),
            # same identifiers
            (CreativeWork(1, id=1, related_works=[CreativeWork(1, id=1)
                                                  ]), CreativeWork(1, id=1)),
            # same and different identifiers
            (CreativeWork(
                1,
                id=1,
                related_works=[CreativeWork(2, id=2),
                               CreativeWork(1, id=1)]),
             CreativeWork(1, id=1, related_works=[CreativeWork(2, id=2)]))
        ])
    def test_normalize_related_work(self, input, output, Graph):
        """Process a work that has related works and check the result.

        ``input`` and ``output`` are node descriptions passed directly to
        ``Graph``; processing runs with disambiguation switched off.
        """
        change_graph = ChangeGraph(Graph(input))
        change_graph.process(disambiguate=False)

        # Serialize before building the expected graph (original call order).
        actual = change_graph.serialize()
        expected = Graph(output)
        assert actual == expected

    @pytest.mark.parametrize('input, output', [
        ({
            'title': '',
            'description': ''
        }, {
            'title': '',
            'description': ''
        }),
        ({
            'title': '    ',
            'description': '     '
        }, {
            'title': '',
            'description': ''
        }),
        ({
            'title': 'Title\nLine'
        }, {
            'title': 'Title Line'
        }),
        ({
            'description': 'Line\nAfter\nLine\nAfter\nLine'
        }, {
            'description': 'Line After Line After Line'
        }),
        ({
            'description': 'null'
        }, {
            'description': ''
        }),
    ])
    def test_normalize_creativework(self, input, output, Graph):
        """Process a CreativeWork built from keyword fields and check it.

        ``input`` and ``output`` are dicts expanded into CreativeWork keyword
        arguments (the parametrized cases use ``title`` and ``description``).
        Processing runs with disambiguation switched off.
        """
        source_work = CreativeWork(**input)
        change_graph = ChangeGraph(Graph(source_work))
        change_graph.process(disambiguate=False)

        # Serialize before building the expected graph (original call order).
        actual = change_graph.serialize()
        expected = Graph(CreativeWork(**output))
        assert actual == expected

    @pytest.mark.parametrize('input, output', [
        ('', None),
        ('htp://google.com', None),
        ('blackmagic://goat.hooves', None),
        ('1476-4687 ', None),
        ('urn://issn/1476-4687', None),
        ('0000000248692412', None),
        ('https://orcid.org/0000-0002-1694-233X', None),
        ('*****@*****.**', None),
        ('10.517ccdc.csd.c>c1lj81f', None),
        ('http://arxiv.org/index.php?view&id=12',
         'http://arxiv.org/index.php?view&id=12'),
        ('10.5517/ccdc.csd.cc1lj81f',
         'http://dx.doi.org/10.5517/CCDC.CSD.CC1LJ81F'),
        ('   arxiv:1212.20282    ', 'http://arxiv.org/abs/1212.20282'),
        ('oai:subdomain.cos.io:this.is.stuff',
         'oai://subdomain.cos.io/this.is.stuff'),
        ('Beau, R <http://researchonline.lshtm.ac.uk/view/creators/999461.html>;  Douglas, I <http://researchonline.lshtm.ac.uk/view/creators/103524.html>;  Evans, S <http://researchonline.lshtm.ac.uk/view/creators/101520.html>;  Clayton, T <http://researchonline.lshtm.ac.uk/view/creators/11213.html>;  Smeeth, L <http://researchonline.lshtm.ac.uk/view/creators/13212.html>;      (2011) How Long Do Children Stay on Antiepileptic Treatments in the UK?  [Conference or Workshop Item]',
         None),
    ])
    def test_normalize_workidentifier(self, input, output, Graph):
        """Process a lone WorkIdentifier and check the serialized graph.

        ``input`` is the raw ``uri``. When ``output`` is truthy the serialized
        graph must equal a graph holding a WorkIdentifier with that uri
        (built with ``parse=True``); otherwise it must equal an empty list.
        """
        identifier = WorkIdentifier(uri=input, creative_work=None)
        change_graph = ChangeGraph(Graph(identifier))
        change_graph.process(disambiguate=False)

        # Serialize before building the expected graph (original call order).
        actual = change_graph.serialize()
        if output:
            expected = Graph(
                WorkIdentifier(uri=output, parse=True, creative_work=None))
        else:
            expected = []
        assert actual == expected

    @pytest.mark.parametrize('input, output', [
        ('', None),
        ('             ', None),
        ('0000000248692412', None),
        ('000000000248692419', None),
        ('urn://issn/1476-4687', 'urn://issn/1476-4687'),
        ('0000000248692419', 'http://orcid.org/0000-0002-4869-2419'),
        ('0000-0002-4869-2419', 'http://orcid.org/0000-0002-4869-2419'),
        ('0000-0002-4869-2419', 'http://orcid.org/0000-0002-4869-2419'),
        ('Beau, R <http://researchonline.lshtm.ac.uk/view/creators/999461.html>;  Douglas, I <http://researchonline.lshtm.ac.uk/view/creators/103524.html>;  Evans, S <http://researchonline.lshtm.ac.uk/view/creators/101520.html>;  Clayton, T <http://researchonline.lshtm.ac.uk/view/creators/11213.html>;  Smeeth, L <http://researchonline.lshtm.ac.uk/view/creators/13212.html>;      (2011) How Long Do Children Stay on Antiepileptic Treatments in the UK?  [Conference or Workshop Item]',
         None),
    ])
    def test_normalize_agentidentifier(self, input, output, Graph):
        """Process a lone AgentIdentifier and check the serialized graph.

        ``input`` is the raw ``uri``. When ``output`` is truthy the serialized
        graph must equal a graph holding an AgentIdentifier with that uri
        (built with ``parse=True``); otherwise it must equal an empty list.
        """
        identifier = AgentIdentifier(uri=input, agent=None)
        change_graph = ChangeGraph(Graph(identifier))
        change_graph.process(disambiguate=False)

        # Serialize before building the expected graph (original call order).
        actual = change_graph.serialize()
        if output:
            expected = Graph(
                AgentIdentifier(uri=output, parse=True, agent=None))
        else:
            expected = []
        assert actual == expected

    @pytest.mark.parametrize('input, output', [(
        input,
        Creator(cited_as='James Bond',
                agent=Person(name='James Bond',
                             family_name='Bond',
                             given_name='James')),
    ) for input in [
        Creator(cited_as='   \t James\n Bond \t     ',
                agent=Person(name='James  Bond')),
        Creator(cited_as='', agent=Person(name='James  Bond')),
        Creator(cited_as='', agent=Person(name='James      Bond')),
        Creator(cited_as='',
                agent=Person(given_name='James', family_name='Bond')),
    ]] + [(
        input,
        Contributor(cited_as='James Bond',
                    agent=Person(name='James Bond',
                                 family_name='Bond',
                                 given_name='James')),
    ) for input in [
        Contributor(cited_as='   \t James\n Bond \t     ',
                    agent=Person(name='James  Bond')),
        Contributor(cited_as='', agent=Person(name='James  Bond')),
    ]])
    def test_normalize_agentworkrelation(self, input, output, Graph):
        """Process a single agent-work relation and check the result.

        ``input`` and ``output`` are relation nodes (Creator / Contributor in
        the parametrized cases) passed directly to ``Graph``; processing runs
        with disambiguation switched off.
        """
        change_graph = ChangeGraph(Graph(input))
        change_graph.process(disambiguate=False)

        # Serialize before building the expected graph (original call order).
        actual = change_graph.serialize()
        expected = Graph(output)
        assert actual == expected