Ejemplo n.º 1
0
def _test_match_score():
    """Print a comparison of several author-matching strategies.

    Uses the sample at index 8 of ``corresponding_authors`` (a
    ``(name, expected, candidates)`` triple), scores every candidate
    against the corresponding author in four different ways and prints
    the winner according to each of them, plus the closest match found
    by ``difflib.get_close_matches``.

    This is a manual debugging aid: it returns nothing and only writes
    to standard output.
    """
    corr, expected, cand = corresponding_authors[8]
    corr = Author(corr)

    # Single-argument print() calls produce the same output on Python 2
    # (print statement + parenthesised expression) and Python 3.
    print('')
    print('{} {}'.format(corr, corr.chunks))
    print('')
    import difflib

    # The reference author's normalised chunk string does not depend on
    # the candidate, so compute it once instead of on every iteration.
    p1 = ' '.join([c.name for c in sorted(corr.chunks)])

    scored_can = []
    for a in cand:
        # Build the candidate once and reuse it for every metric.
        candidate = Author(a)
        p2 = ' '.join([c.name for c in sorted(candidate.chunks)])
        l = levenshtein(p1, p2)
        # Squared edit distance scaled by the lengths of the raw names.
        l = float(l ** 2) / len(a) / len(corr.name)
        c = corr.match_score(candidate)
        scored_can.append((
            candidate,
            c,
            l,
            c - 10 * l,
            corr.distance(candidate)
        ))

    for a, c, l, c2, d in scored_can:
        print('{} {} {} {} {}'.format(a.chunks, c, l, c2, d))

    print('')
    # Tuple layout: (author, match score, scaled levenshtein, mixed, distance)
    print('Score: {}'.format(max(scored_can, key=lambda row: row[1])))
    print('Leven: {}'.format(min(scored_can, key=lambda row: row[2])))
    print('Mixed: {}'.format(max(scored_can, key=lambda row: row[3])))
    print('Implemented: {}'.format(min(scored_can, key=lambda row: row[4])))
    print('Diff: {}'.format(difflib.get_close_matches(corr.name, cand, 1)))
Ejemplo n.º 2
0
Archivo: wos.py Proyecto: GaretJax/irco
    def process_record(self, record):
        """Populate *record* with authors, institutions and the corresponding author.

        The method fills in:

        * ``record['institutions']`` -- list of institution strings;
        * ``record['authors']`` -- list of ``(Author, institution_index)``
          tuples;
        * ``record['corresponding_author']`` -- index into
          ``record['authors']`` (``0`` when the ``RP`` field is empty);
        * ``record['ambiguous_affiliations']`` -- set to ``True`` only when
          authors and affiliations could not be paired and
          ``self.include_ambiguous_affiliations`` is enabled.

        Returns the updated record, or ``None`` when the record is dropped
        (ambiguous affiliations that are not to be included, or a
        corresponding author that matches none of the listed authors).
        """
        record['institutions'] = []
        record['authors'] = []

        affiliations = self.splitter.findall(
            record['authors_with_affiliations'].strip())

        if not affiliations:
            # The splitter found no (authors, institution) groups: fall back
            # to pairing the plain author list with the affiliation list.
            aut = record['AF'].split('; ')
            aff = record['authors_with_affiliations'].split('; ')
            if len(aut) == len(aff):
                # One affiliation per author, in order.
                affiliations = list(zip(aut, aff))
            elif len(aff) == 1:
                # A single affiliation shared by every author.
                affiliations = [(a, aff[0]) for a in aut]
            else:
                self.pipeline.inc_metric('ambiguous_author_affiliations')
                print('-' * 80)
                print(u'Ambiguous author affiliations for "{title}":'.format(
                    **record))
                print(' Authors:')
                for a in aut:
                    print('  * {}'.format(a))
                print(' Affiliations:')
                for a in aff:
                    print('  * {}'.format(a))
                print('-' * 80)
                if self.include_ambiguous_affiliations:
                    # Set all author affiliations to the first institution in
                    # the list, and set the ambiguous flag...
                    record['ambiguous_affiliations'] = True
                    affiliations = [(a, aff[0]) for a in aut]
                else:
                    return None

        # TODO: Some authors could have two affiliations! This should be
        # checked here and a warning raised.

        for i, (authors, institution) in enumerate(affiliations):
            record['institutions'].append(institution)

            for a in authors.split('; '):
                author = Author(a)
                record['authors'].append((author, i))

        if record['RP']:
            reprint = record['RP']
            marker = ' (reprint author)'

            # str.partition never yields a bogus index: when the marker is
            # missing the whole field is taken as the author name (the
            # previous find()-based slicing dropped its last character), and
            # when the ', ' separator is missing the institution is empty
            # (previously an arbitrary tail of the field).
            corresponding = Author(reprint.partition(marker)[0].strip())
            institution = reprint.partition(marker + ', ')[2]

            match = corresponding.find_best_match(
                [a[0] for a in record['authors']])
            if not match:
                self.pipeline.inc_metric('corresponding_author_unmatched')
                print('-' * 80)
                print('No corresponding author match found for:')
                print('  {!r}/{!r}'.format(record['title'],
                                           corresponding.name))
                names = (a[0].name for a in record['authors'])
                # Dump the unmatched sample to the self.unmatched_authors
                # stream.
                pprint.pprint((corresponding.name, 0, tuple(names)),
                              self.unmatched_authors)
                print('-' * 80)
                return None

            # NOTE(review): the match is located by identity, so this relies
            # on find_best_match returning one of the objects passed in --
            # confirm against Author.find_best_match.
            for i, (a, institution_id) in enumerate(record['authors']):
                if a is match:
                    record['corresponding_author'] = i
                    curr_inst = record['institutions'][institution_id]
                    # Only override the affiliation when the RP field
                    # actually carried an institution that differs from the
                    # one already assigned.
                    if institution and institution != curr_inst:
                        record['institutions'].append(institution)
                        record['authors'][i] = (
                            a, len(record['institutions']) - 1)
                    break
        else:
            # No reprint information: default to the first listed author.
            self.pipeline.inc_metric('corresponding_author_undefined')
            record['corresponding_author'] = 0
            print (u'Undefined corresponding author for "{}", selecting "{}"'
                   .format(record['title'], record['authors'][0][0].name))
        return record