예제 #1
0
    def get_dblp_author_from_zh(author_name):
        """Match a Chinese-named CDBLP author to the best DBLP candidate.

        The name is converted with ``CDBLPAuthor.getEnglishName`` and turned
        into a DBLP ``urlpt`` of the form
        ``<last-name initial>/<last_name>:<first_name>``. The DBLP page for
        that urlpt is fetched; every ``<li class="homonym">`` entry on it
        becomes a candidate, and if there are none the urlpt itself is the
        only candidate. The candidate whose co-author names overlap most with
        the CDBLP author's co-authors is selected.

        Args:
            author_name: The author's name as stored in CDBLP.

        Returns:
            A dict with two keys. ``'cdblp'`` is ``CDBLPAuthor.get_author()``
            for ``author_name``. ``'dblp'`` is the selected candidate's
            ``get_author()`` result, or ``{}`` when there is no candidate or
            the best candidate shares no co-author with the CDBLP record.
        """
        author_cdblp = CDBLPAuthor(author_name)
        author_name_comp = CDBLPAuthor.getEnglishName(author_name)

        urlpt = '{}/{}:{}'.format(
            author_name_comp['last_name'][0].lower(),
            author_name_comp['last_name'],
            author_name_comp['first_name'],
        )

        # Parse inside the ``with`` so the HTTP response is always closed.
        with urlopen(DBLPQuery.get_dblp_url(urlpt)) as res:
            dom = BeautifulSoup(res)

        candidate_urlpts = set()
        for cu_tag in dom.find_all('li', 'homonym'):
            # Drop the first 3 and last 5 characters of the link target
            # (presumably a '../' prefix and '.html' suffix -- TODO confirm).
            candidate_urlpts.add(cu_tag.find('a')['href'][3:-5])

        # No homonyms listed: the computed urlpt is the only candidate.
        if not candidate_urlpts:
            candidate_urlpts.add(urlpt)

        candidate_authors = []
        for cu in candidate_urlpts:
            candidate_authors.append(DBLPAuthor(cu))
            print(cu)

        # for example, '骞雅楠' => 'Ya-nan Qian' in DBLP
        # NOTE(review): candidate_urlpts always holds at least one entry here,
        # so candidate_authors is never empty and this fallback cannot
        # currently run. Kept unchanged until its intended trigger is decided.
        if not candidate_authors and len(author_name) == 3:
            dashed_name = author_name_comp['full_name_dash']
            search_url = '{}/search/author?xauthor={}'.format(
                DBLPQuery.base_url, quote(dashed_name))
            with urlopen(search_url) as res:
                dom = BeautifulSoup(res)

            for author_tag in dom.find_all('author'):
                if author_tag.string == dashed_name:
                    candidate_authors.append(DBLPAuthor(author_tag['urlpt']))

        if not candidate_authors:
            return {'cdblp': author_cdblp.get_author(), 'dblp': {}}

        coauthors_set_cdblp = {
            a['full_name'] for a in author_cdblp.get_coauthors()
        }

        # Track the running maximum so the candidate with the largest
        # co-author overlap wins; ``>=`` lets a later candidate win a tie.
        target_author = candidate_authors[0]
        coauthor_count_max = -1
        for candidate in candidate_authors:
            coauthor_overlap = coauthors_set_cdblp.intersection(
                set(candidate.get_coauthors()))
            if len(coauthor_overlap) >= coauthor_count_max:
                coauthor_count_max = len(coauthor_overlap)
                target_author = candidate

        # Without a single shared co-author there is no evidence that any
        # candidate is the same person.
        if coauthor_count_max == 0:
            return {'cdblp': author_cdblp.get_author(), 'dblp': {}}

        return {
            'cdblp': author_cdblp.get_author(),
            'dblp': target_author.get_author(),
        }
예제 #2
0
    def author_distinct(cached_list, cached_set, author_name):
        """Resolve an author name to its CDBLP and DBLP records.

        Three cases are tried in order:

        1. ``author_name`` is in ``cached_set`` and an entry of
           ``cached_list`` matches it: the CDBLP record comes from the
           'author-entries-cache.data' cache and the DBLP record is built
           from the entry's last and first name.
        2. The first character's code point lies strictly between 0x3400 and
           0x2b6f8 (presumably a CJK-ideograph test -- TODO confirm): DBLP is
           searched for the name from ``CDBLPAuthor.getEnglishName`` and the
           hit sharing the most co-author names with the CDBLP author wins.
        3. Otherwise DBLP is searched for ``author_name`` itself and the first
           exact hit is used.

        Args:
            cached_list: Iterable of dicts with the keys ``'full_name'``,
                ``'zh'``, ``'last_name'`` and ``'first_name'``.
            cached_set: Container of author names known to the cache.
            author_name: The name to resolve.

        Returns:
            A dict with the keys ``'cdblp'`` and ``'dblp'``. Either value is
            ``{}`` when no matching record is found, and both are ``{}`` for
            an empty ``author_name``.
        """
        # An empty name cannot be classified or searched for.
        if not author_name:
            return {'cdblp': {}, 'dblp': {}}

        def search_exact(name):
            # Query the DBLP author search and keep only the hits whose text
            # equals ``name`` exactly.
            search_url = '{}/search/author?xauthor={}'.format(
                DBLPQuery.base_url, quote(name))
            with urlopen(search_url) as res:
                dom = BeautifulSoup(res)
            return [
                DBLPAuthor(author_tag['urlpt'])
                for author_tag in dom.find_all('author')
                if author_tag.string == name
            ]

        if author_name in cached_set:
            stripped_name = author_name.strip()
            cached_entry = next(
                (entry for entry in cached_list
                 if entry['full_name'] == stripped_name
                 or entry['zh'] == author_name),
                None,
            )
            # If the set and the list disagree, treat the name as a cache miss
            # and fall through to the regular lookup instead of building a
            # DBLP record from an unrelated entry.
            if cached_entry is not None:
                d = DBLPQuery.get_cache('author-entries-cache.data')
                print('This is a CDBLP author w/ a English name on file.')
                urlpt = '{}/{}:{}'.format(
                    cached_entry['last_name'][0].lower(),
                    cached_entry['last_name'],
                    cached_entry['first_name'],
                )
                return {
                    'cdblp': d.get(cached_entry['zh'], {}),
                    'dblp': DBLPAuthor(urlpt).get_author(),
                }

        if 0x3400 < ord(author_name[0]) < 0x2b6f8:
            print('This is a CDBLP author w/ a Chinese name.')

            author_cdblp = CDBLPAuthor(author_name)
            author_name_comp = CDBLPAuthor.getEnglishName(author_name)

            candidate_authors = search_exact(author_name_comp['full_name'])

            # for example, '骞雅楠' => 'Ya-nan Qian' in DBLP
            if not candidate_authors and len(author_name) == 3:
                candidate_authors = search_exact(
                    author_name_comp['full_name_dash'])

            if not candidate_authors:
                return {'cdblp': author_cdblp.get_author(), 'dblp': {}}

            coauthors_set_cdblp = {
                a['full_name'] for a in author_cdblp.get_coauthors()
            }

            # Track the running maximum so the candidate with the largest
            # co-author overlap wins; ``>=`` lets a later candidate win a tie.
            target_author = candidate_authors[0]
            coauthor_count_max = -1
            for candidate in candidate_authors:
                coauthor_overlap = coauthors_set_cdblp.intersection(
                    set(candidate.get_coauthors()))
                if len(coauthor_overlap) >= coauthor_count_max:
                    coauthor_count_max = len(coauthor_overlap)
                    target_author = candidate

            return {
                'cdblp': author_cdblp.get_author(),
                'dblp': target_author.get_author(),
            }

        print('This is a non-CDBLP author.')
        candidates = search_exact(author_name)
        # Report "not found" the same way the other branches do rather than
        # failing on an empty result list.
        if not candidates:
            return {'cdblp': {}, 'dblp': {}}

        return {'cdblp': {}, 'dblp': candidates[0].get_author()}