def run(self):
    """Fetch this thread's C-DBLP author and persist the record to disk.

    Writes ``authors/<id>-<name>-entry.data`` containing the author's
    JSON-serialized record, then reports completion on stdout.
    """
    name = self.author_name.strip()
    author = CDBLPAuthor(name)
    # 'with' guarantees the handle is closed even if the write raises
    # (the original open/close pair leaked the handle on error).
    with open('authors/{}-{}-entry.data'.format(self.id, name), 'w') as chunk:
        chunk.write(json.dumps(author.get_author()))
    print(self.author_name + ' %d is done.' % self.id)
def run(self):
    """Fetch the C-DBLP author page and cache the parsed entry on disk.

    Output file: ``authors/<id>-<stripped name>-entry.data`` (JSON).
    Prints a completion message when done.
    """
    stripped = self.author_name.strip()
    author = CDBLPAuthor(stripped)
    # Context manager closes the file even when json.dumps/write raises;
    # the original explicit close() leaked the descriptor on error.
    path = 'authors/{}-{}-entry.data'.format(self.id, stripped)
    with open(path, 'w') as chunk:
        chunk.write(json.dumps(author.get_author()))
    print(self.author_name + ' %d is done.' % self.id)
def get_match(author_name):
    """Rank DBLP homonym candidates for a C-DBLP author.

    Scrapes the DBLP page for the transliterated name, starts one
    ThreadMatch worker per homonym candidate, keeps every candidate whose
    score in Data.result exceeds 0.1, and returns them sorted by rank
    (descending) together with the C-DBLP author object.
    """
    Data.clear()
    author_cdblp = CDBLPAuthor(author_name)
    name_comp = CDBLPAuthor.getEnglishName(author_name)
    urlpt = '{}/{}:{}'.format(
        name_comp['last_name'][0].lower(),
        name_comp['last_name'],
        name_comp['first_name'])
    candidate_urlpts = set()
    author_affiliation = dict()
    dom = BeautifulSoup(urlopen(DBLPQuery.get_dblp_url(urlpt)))
    for cu_tag in dom.find_all('li', 'homonym'):
        anchor = cu_tag.find('a')
        cu = anchor['href'][3:-5]
        candidate_urlpts.add(cu)
        author_affiliation[cu] = anchor.next_sibling.string
    if not candidate_urlpts:
        # No homonym disambiguation list: fall back to the direct urlpt.
        candidate_urlpts.add(urlpt)
        author_affiliation[urlpt] = 'Default University'
    workers = []
    for cu in candidate_urlpts:
        worker = ThreadMatch(cu, author_cdblp)
        worker.start()
        workers.append(worker)
    for worker in workers:
        worker.join()
    result = [
        {'urlpt': k, 'aff': author_affiliation[k], 'rank': v}
        for k, v in Data.result.items()
        if v > 0.1
    ]
    result.sort(key=lambda entry: entry['rank'], reverse=True)
    return {'author': author_cdblp, 'result': result}
def get_authors_by_venue(cached_list, cached_set, cdblp_venue, dblp_venue):
    """Count authors who published in both the C-DBLP and DBLP venue.

    Loads the cached C-DBLP publications for ``cdblp_venue``, queries the
    DBLP search API for ``dblp_venue``, and returns a dict mapping each
    matched English full name to ``{'zh': <Chinese name>, 'count': n}``.
    Returns None (with a message) when the C-DBLP venue is not cached.
    """
    d = DBLPQuery.get_cache('cdblp-pub-cache.data')
    if cdblp_venue.get('title') not in d:
        print('This C-DBLP venue is not on file.')
        return
    res = urlopen('http://www.dblp.org/search/api/?q=ce:venue:{}:*&h=750&format=json'.format(dblp_venue.get('title').lower()))
    # fix titles as { "Title ..." }: strip the stray braces so the payload
    # parses as JSON
    fixed_json = re.compile(r'({\s*)(".+")(\s*})').sub(lambda m: m.group(2), res.read().decode('utf-8'))
    # get publications
    cdblp_pubs = d.get(cdblp_venue.get('title'))
    dblp_pubs = json.loads(fixed_json)
    cdblp_authors = set()
    dblp_authors = set()
    authors = dict()
    # cdblp cache layout observed here: {year: {issue: [publication, ...]}}
    for ky in cdblp_pubs.keys():
        for ki in cdblp_pubs.get(ky).keys():
            for pub in cdblp_pubs.get(ky).get(ki):
                for author in pub.get('authors'):
                    cdblp_authors.add(author)
    for pub in dblp_pubs.get('result').get('hits').get('hit'):
        try:
            for author in pub.get('info').get('authors').get('author'):
                dblp_authors.add(author)
        except AttributeError:
            print('PublicationException: %s' % pub.get('@id'))
    pinyin = PinYin()
    pinyin.load_word()
    for author in cdblp_authors:
        name_comp = CDBLPAuthor.get_english_name(author, pinyin)
        if name_comp['full_name'] in dblp_authors:
            key = name_comp['full_name']
        # BUG FIX: the original tested `authors.__contains__(full_name_dash)`
        # here instead of membership in dblp_authors (the parallel of the
        # first branch), so dash-form names (e.g. three-character Chinese
        # names) were never counted at all.
        elif len(author) == 3 and name_comp['full_name_dash'] in dblp_authors:
            key = name_comp['full_name_dash']
        else:
            continue
        if key in authors:
            authors[key]['zh'] = name_comp['zh']
            authors[key]['count'] += 1
        else:
            authors[key] = {'zh': name_comp['zh'], 'count': 1}
    return authors
def get_match(author_name):
    """Find and rank DBLP urlpt candidates matching a C-DBLP author.

    Clears Data, scrapes homonym candidates from the author's DBLP page,
    scores each candidate in a ThreadMatch worker, and returns the
    candidates whose score exceeds 0.1, ordered best-first.
    """
    Data.clear()
    author_cdblp = CDBLPAuthor(author_name)
    comp = CDBLPAuthor.getEnglishName(author_name)
    urlpt = '{}/{}:{}'.format(
        comp['last_name'][0].lower(), comp['last_name'], comp['first_name'])
    candidate_urlpts = set()
    author_affiliation = {}
    res = urlopen(DBLPQuery.get_dblp_url(urlpt))
    for cu_tag in BeautifulSoup(res).find_all('li', 'homonym'):
        link = cu_tag.find('a')
        cu = link['href'][3:-5]
        candidate_urlpts.add(cu)
        author_affiliation[cu] = link.next_sibling.string
    if len(candidate_urlpts) == 0:
        # Page had no homonym entries: use the computed urlpt directly.
        candidate_urlpts.add(urlpt)
        author_affiliation[urlpt] = 'Default University'
    threads = [ThreadMatch(cu, author_cdblp) for cu in candidate_urlpts]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
    matches = (
        {'urlpt': k, 'aff': author_affiliation[k], 'rank': v}
        for k, v in Data.result.items() if v > 0.1
    )
    ranked = sorted(matches, key=lambda m: m['rank'], reverse=True)
    return {'author': author_cdblp, 'result': ranked}
def get_sample_users():
    """Scrape the EasyScholar user list and cache the parsed author names.

    Collects every linked author name from the 'moreuser' page, converts
    each to its English-name components via CDBLPAuthor.getEnglishName,
    writes the list as JSON to ``author-cache.data``, and returns it.
    """
    piy = PinYin()
    piy.load_word()
    author_list = []
    res = urlopen('http://easyscholar.ruc.edu.cn/moreuser.html')
    dom = BeautifulSoup(res)
    author_tags = dom.find_all(href=re.compile('^homepage/'))
    for author_tag in author_tags:
        strong = author_tag.findChild('strong')
        if strong:
            author_list.append(CDBLPAuthor.getEnglishName(strong.contents[0]))
    # Open the cache only after scraping succeeds: the original opened it
    # first, truncating the previous cache and leaking the handle whenever
    # the network request raised.
    with open('author-cache.data', 'w') as cache:
        cache.write(json.dumps(author_list))
    return author_list
def get_coauthored_publications_by_authors(cached_list, cached_set, author1_name, author2_name):
    """
    1. Get authors' publications
    2. Merge publications

    Returns {'cdblp': [...], 'dblp': [...]}: author1's publications on
    which author2 also appears (matched by Chinese or English name).
    """
    publications = {'cdblp': [], 'dblp': []}
    pub1 = DBLPQuery.get_publications_by_author(cached_list, cached_set, author1_name)
    author2 = DBLPQuery.author_distinct(cached_list, cached_set, author2_name)
    # Hoist author2's name variants out of the loops.
    a2_zh = author2.get('cdblp', {}).get('author_name', {}).get('zh')
    a2_en = author2.get('dblp', {}).get('author_name')
    a2_full = author2.get('cdblp', {}).get('author_name', {}).get('full_name')
    for cdblp_pub in pub1.get('cdblp', []):
        zh_names = set(cdblp_pub.get('authors', []))
        en_names = {CDBLPAuthor.getEnglishName(n)['full_name'] for n in zh_names}
        if a2_zh in zh_names or a2_en in en_names:
            publications['cdblp'].append(cdblp_pub)
    for dblp_pub in pub1.get('dblp', []):
        names = {a.get('name') for a in dblp_pub.get('authors', [])}
        if a2_en in names or a2_full in names:
            publications['dblp'].append(dblp_pub)
    return publications
def get_dblp_author_from_zh(author_name):
    """Map a Chinese author name to the best-matching DBLP author record.

    Builds a DBLP urlpt from the transliterated name, expands homonym
    candidates from the DBLP page, and selects the candidate sharing the
    most coauthors with the C-DBLP record.

    Returns {'cdblp': ..., 'dblp': ...}; 'dblp' is {} when there is no
    candidate or no coauthor overlap at all.
    """
    author_cdblp = CDBLPAuthor(author_name)
    author_name_comp = CDBLPAuthor.getEnglishName(author_name)
    urlpt = '{}/{}:{}'.format(
        author_name_comp['last_name'][0].lower(),
        author_name_comp['last_name'],
        author_name_comp['first_name'])
    candidate_urlpts = set()
    res = urlopen(DBLPQuery.get_dblp_url(urlpt))
    dom = BeautifulSoup(res)
    for cu_tag in dom.find_all('li', 'homonym'):
        candidate_urlpts.add(cu_tag.find('a')['href'][3:-5])
    if len(candidate_urlpts) == 0:
        candidate_urlpts.add(urlpt)
    candidate_authors = []
    for cu in candidate_urlpts:
        candidate_authors.append(DBLPAuthor(cu))
        print(cu)
    # for example, '骞雅楠' => 'Ya-nan Qian' in DBLP: retry with the dashed
    # given-name form.  NOTE(review): candidate_urlpts is never empty above,
    # so this branch looks unreachable here — kept for safety / parity with
    # author_distinct.
    if len(candidate_authors) == 0 and len(author_name) == 3:
        res = urlopen('{}/search/author?xauthor={}'.format(DBLPQuery.base_url, quote(author_name_comp['full_name_dash'])))
        dom = BeautifulSoup(res)
        for author_tag in dom.find_all('author'):
            if author_tag.string == author_name_comp['full_name_dash']:
                candidate_authors.append(DBLPAuthor(author_tag['urlpt']))
    try:
        target_author = candidate_authors[0]
    except IndexError:
        return {'cdblp': author_cdblp.get_author(), 'dblp': {}}
    coauthors_set_cdblp = set(map(lambda a: a['full_name'], author_cdblp.get_coauthors()))
    coauthor_count_max = len(coauthors_set_cdblp.intersection(set(target_author.get_coauthors())))
    for candidate in candidate_authors[1:]:
        coauthor_overlap = coauthors_set_cdblp.intersection(set(candidate.get_coauthors()))
        # BUG FIX: the original never updated coauthor_count_max inside this
        # loop, so any later candidate merely *tying* the first candidate's
        # overlap count displaced the winner; track the running maximum.
        if len(coauthor_overlap) >= coauthor_count_max:
            target_author = candidate
            coauthor_count_max = len(coauthor_overlap)
    if coauthor_count_max == 0:
        return {'cdblp': author_cdblp.get_author(), 'dblp': {}}
    return {'cdblp': author_cdblp.get_author(), 'dblp': target_author.get_author()}
def author_distinct(cached_list, cached_set, author_name):
    """Resolve one author name to their C-DBLP and DBLP records.

    Three cases, keyed on the cache and the name's first character:
      1. name is in cached_set: return the cached C-DBLP entry plus the
         DBLP record built from the cached name components;
      2. name starts with a CJK codepoint: treat as a C-DBLP (Chinese)
         author and pick the DBLP search candidate sharing the most
         coauthors;
      3. otherwise: exact-name DBLP search.

    Returns {'cdblp': ..., 'dblp': ...}; either side may be {} when no
    match exists.
    """
    trial = author_name[0]
    if author_name in cached_set:
        d = DBLPQuery.get_cache('author-entries-cache.data')
        print('This is a CDBLP author w/ a English name on file.')
        author_name_zh = ''
        for author_name_comp in cached_list:
            if author_name_comp['full_name'] == author_name.strip() or author_name_comp['zh'] == author_name:
                author_name_zh = author_name_comp['zh']
                break
        # NOTE(review): relies on cached_list containing the author whenever
        # author_name is in cached_set; otherwise author_name_comp is the
        # last list element (or unbound for an empty list) — verify callers.
        return {
            'cdblp': d.get(author_name_zh, {}),
            'dblp': DBLPAuthor('{}/{}:{}'.format(
                author_name_comp['last_name'][0].lower(),
                author_name_comp['last_name'],
                author_name_comp['first_name'])).get_author()
        }

    def _search_exact(name):
        # One DBLP author-search round trip; keep exact string matches only.
        res = urlopen('{}/search/author?xauthor={}'.format(DBLPQuery.base_url, quote(name)))
        dom = BeautifulSoup(res)
        return [DBLPAuthor(tag['urlpt'])
                for tag in dom.find_all('author') if tag.string == name]

    if 0x3400 < ord(trial) < 0x2b6f8:  # CJK codepoint range
        print('This is a CDBLP author w/ a Chinese name.')
        author_cdblp = CDBLPAuthor(author_name)
        author_name_comp = CDBLPAuthor.getEnglishName(author_name)
        candidate_authors = _search_exact(author_name_comp['full_name'])
        # for example, '骞雅楠' => 'Ya-nan Qian' in DBLP: retry dashed form
        if len(candidate_authors) == 0 and len(author_name) == 3:
            candidate_authors = _search_exact(author_name_comp['full_name_dash'])
        if not candidate_authors:
            return {'cdblp': author_cdblp.get_author(), 'dblp': {}}
        coauthors_set_cdblp = set(map(lambda a: a['full_name'], author_cdblp.get_coauthors()))
        target_author = candidate_authors[0]
        coauthor_count_max = len(coauthors_set_cdblp.intersection(set(target_author.get_coauthors())))
        for candidate in candidate_authors[1:]:
            coauthor_overlap = coauthors_set_cdblp.intersection(set(candidate.get_coauthors()))
            # BUG FIX: the original never updated coauthor_count_max in this
            # loop, so a later candidate that merely tied the *first*
            # candidate's count displaced the best one; track the maximum.
            if len(coauthor_overlap) >= coauthor_count_max:
                target_author = candidate
                coauthor_count_max = len(coauthor_overlap)
        return {'cdblp': author_cdblp.get_author(), 'dblp': target_author.get_author()}
    else:
        print('This is a non-CDBLP author.')
        candidates = [DBLPAuthor(tag['urlpt'])
                      for tag in BeautifulSoup(urlopen('{}/search/author?xauthor={}'.format(DBLPQuery.base_url, quote(author_name)))).find_all('author')
                      if author_name == tag.string]
        # Robustness fix: the original indexed candidates[0] unconditionally
        # and raised IndexError when the search came back empty.
        if not candidates:
            return {'cdblp': {}, 'dblp': {}}
        return {'cdblp': {}, 'dblp': candidates[0].get_author()}
def run(self):
    """Download this worker's journal via CDBLPAuthor.parallel_get, then log completion."""
    journal = self.journal
    CDBLPAuthor.parallel_get(journal, self.link)
    print(journal + ' is done.')