def fetch_papers(db_manager: DBManager, base_url: str, list_url: str,
                 conf_id: str, conf_sub_id: str, conf_name: str) -> None:
    """Fetches the data of all the papers found at list_url and adds them to
    the database, if the data is valid."""
    print(conf_name)
    print(conf_id, conf_sub_id)
    print(list_url)

    with urllib.request.urlopen(list_url) as url:
        response = url.read()
    soup = BeautifulSoup(response, 'html.parser')

    papers_meta_list = soup.find('div', {'class': 'container-fluid'}).find_all('li')
    page_urls_list = [
        base_url + m.find('a').get('href') for m in papers_meta_list
    ]
    titles_list = [str(m.find('a').string) for m in papers_meta_list]
    authors_list = [
        format_authors(m.find('i').string) for m in papers_meta_list
    ]
    conf_date = dateutil.parser.parse(conf_id[-4:] + '-12')

    if (len(page_urls_list) == len(titles_list)
            and len(page_urls_list) == len(authors_list)):
        for i, page_url in enumerate(tqdm(page_urls_list)):
            pid = db_manager.create_paper_id(conf_id, conf_sub_id, titles_list[i])
            if not db_manager.exists(pid):
                try:
                    with urllib.request.urlopen(page_url) as url:
                        response2 = url.read()
                    soup2 = BeautifulSoup(response2, 'html.parser')

                    pdf_url = soup2.find('div', {'class': 'container-fluid'}).find_all('a')
                    pdf_url = [p.get('href') for p in pdf_url]
                    pdf_url = base_url + [
                        p for p in pdf_url if p.lower().endswith('paper.pdf')
                    ][0]
                    summary = soup2.find('div', {'class': 'container-fluid'}).find_all('p')[-1]
                    summary = flatten_content_list(summary.contents)

                    print(titles_list[i])
                    print(authors_list[i])
                    print(page_urls_list[i])
                    print(pdf_url)
                    print(summary)
                    db_manager.add_paper(conf_id, conf_sub_id,
                                         conf_sub_id.lower() != 'main',
                                         conf_name, titles_list[i],
                                         authors_list[i], page_urls_list[i],
                                         pdf_url, conf_date, summary)
                except urllib.error.URLError:
                    print('Skipping {:} - URLError'.format(page_url))
            else:
                print('Skipping {:} - Exists'.format(page_url))
    else:
        print('SKIPPING!!! Wrong list sizes. ({:d}, {:d}, {:d})'.format(
            len(page_urls_list), len(titles_list), len(authors_list)))
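
# The fetch_papers variants in this collection all rely on the same third-party
# modules.  An import block along these lines is assumed at the top of each
# module; DBManager and the parsing helpers (format_authors,
# flatten_content_list, get_meta_list, get_abstract, get_summary, format_date,
# extract_abstract) are assumed to be defined elsewhere in the repository.
import urllib.error
import urllib.request

import dateutil.parser
from bs4 import BeautifulSoup
from tqdm import tqdm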

def fetch_papers(db_manager: DBManager, base_url: str, list_url: str,
                 conf_id: str, conf_sub_id: str, conf_name: str) -> None:
    """Fetches the data of all the papers found at list_url and adds them to
    the database, if the data is valid."""
    print(conf_name)
    print(conf_id, conf_sub_id)
    print(list_url)

    # Here list_url is expected to be a local HTML copy of the listing page.
    with open(list_url, 'r') as f:
        response = f.read()
    soup = BeautifulSoup(response, 'html.parser')

    papers_meta_list = soup.find_all('div', {'class': 'myCard col-xs-6 col-md-4'})
    page_urls_list = [
        m.find('a', {'class': 'text-muted'}).get('href') for m in papers_meta_list
    ]
    titles_list = [
        m.find('h5', {'class': 'card-title'}).string.strip()
        for m in papers_meta_list
    ]
    authors_list = [
        format_authors(m.find('h6', {'class': 'card-subtitle text-muted'}).string.strip())
        for m in papers_meta_list
    ]
    conf_year = conf_id[-4:]
    conf_date = dateutil.parser.parse(conf_year + '-09')

    # print(papers_meta_list[0])
    # print(page_urls_list[0])
    # print(authors_list[0])
    # print(titles_list[0])

    if (len(titles_list) == len(authors_list)
            and len(titles_list) == len(page_urls_list)):
        for i in tqdm(range(len(titles_list))):
            pid = db_manager.create_paper_id(conf_id, conf_sub_id, titles_list[i])
            if not db_manager.exists(pid):
                print(page_urls_list[i])
                print(titles_list[i])
                try:
                    with urllib.request.urlopen(page_urls_list[i]) as url:
                        response2 = url.read()
                    soup2 = BeautifulSoup(response2, 'html.parser')

                    pdf_url = soup2.find_all('a', {'class': 'btn btn-info btn-sm mt-1'})
                    pdf_url = [p.get('href') for p in pdf_url]
                    pdf_url = [p for p in pdf_url if '/papers/' in p][0]
                    summary = flatten_content_list(
                        soup2.find('div', {'class': 'col-12 col-lg-8'}).find('p'))

                    print(pdf_url)
                    print(summary)
                    db_manager.add_paper(conf_id, conf_sub_id,
                                         conf_sub_id.lower() != 'main',
                                         conf_name, titles_list[i],
                                         authors_list[i], page_urls_list[i],
                                         pdf_url, conf_date, summary)
                except urllib.error.URLError:
                    print('Skipping {:} - URLError'.format(titles_list[i]))
            else:
                print('Skipping {:} - Exists'.format(titles_list[i]))
    else:
        print('SKIPPING!!! Wrong list sizes. ({:d}, {:d}, {:d})'.format(
            len(page_urls_list), len(authors_list), len(titles_list)))
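
# A hedged usage sketch for the variant above, which reads the paper list from
# a local HTML file.  Every argument here is a hypothetical placeholder; the
# real IDs, URLs, and DBManager construction come from the rest of the repo.
if __name__ == '__main__':
    db_manager = DBManager()  # assumed constructor; the real signature may differ
    fetch_papers(db_manager,
                 base_url='https://example-conference.org/',
                 list_url='saved_paper_list.html',  # local copy of the listing page
                 conf_id='confx2021',
                 conf_sub_id='Main',
                 conf_name='CONFX')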

def fetch_papers(db_manager: DBManager, base_url: str, list_url: str,
                 conf_id: str, conf_sub_id: str, conf_name: str) -> None:
    """Fetches the data of all the papers found at list_url and adds them to
    the database, if the data is valid."""
    print(conf_name)
    print(conf_id, conf_sub_id)
    print(list_url)

    with urllib.request.urlopen(list_url) as url:
        response = url.read()
    soup = BeautifulSoup(response, 'html.parser')

    papers_meta_list = get_meta_list(soup)
    titles_list = [str(m.contents[0]).strip() for m in papers_meta_list]
    authors_list = [
        format_authors(m.find('i', {'class': 'tgp1-c9cr'}))
        for m in papers_meta_list
    ]
    pdf_urls_list = [
        base_url + m.find('a').get('href') for m in papers_meta_list
    ]
    conf_year = conf_id[-4:]
    conf_date = dateutil.parser.parse(conf_year + '-09')
    # Papers from this index onwards belong to the workshop track.
    workshop_istart = 309

    # print(papers_meta_list[0])
    # print(pdf_urls_list[0])
    # print(authors_list[0])
    # print(titles_list[0])

    if (len(titles_list) == len(authors_list)
            and len(titles_list) == len(pdf_urls_list)):
        for i in tqdm(range(len(titles_list))):
            if i == workshop_istart:
                conf_sub_id = 'Workshop'
                conf_name += '_workshop'
            pid = db_manager.create_paper_id(conf_id, conf_sub_id, titles_list[i])
            if not db_manager.exists(pid):
                print(titles_list[i])
                try:
                    # This source exposes no per-paper page or abstract.
                    summary = ''
                    page_url = ''
                    db_manager.add_paper(conf_id, conf_sub_id,
                                         conf_sub_id.lower() != 'main',
                                         conf_name, titles_list[i],
                                         authors_list[i], page_url,
                                         pdf_urls_list[i], conf_date, summary)
                except urllib.error.URLError:
                    print('Skipping {:} - URLError'.format(titles_list[i]))
            else:
                print('Skipping {:} - Exists'.format(titles_list[i]))
    else:
        print('SKIPPING!!! Wrong list sizes. ({:d}, {:d}, {:d})'.format(
            len(pdf_urls_list), len(authors_list), len(titles_list)))

def fetch_papers(db_manager: DBManager, base_url: str, list_url: str,
                 conf_id: str, conf_sub_id: str, conf_name: str) -> None:
    """Fetches the data of all the papers found at list_url and adds them to
    the database, if the data is valid."""
    print(conf_name)
    print(conf_id, conf_sub_id)
    print(list_url)

    with urllib.request.urlopen(list_url) as url:
        response = url.read()
    soup = BeautifulSoup(response, 'html.parser')

    papers_meta_list1 = soup.find_all('dt')
    page_urls_list = []
    for m in papers_meta_list1:
        link = m.find('a').get('href')
        tmp_base_url = base_url
        if not link.startswith('..'):
            base_parts = base_url.split('/')
            tmp_base_url = '/'.join(base_parts[:-2])
        page_urls_list.append(tmp_base_url + link)
    titles_list = [str(m.find('a').string) for m in papers_meta_list1]

    papers_meta_list2 = soup.find_all('dd')
    authors_list = [format_authors(m) for m in papers_meta_list2[::2]]
    pdf_urls_list = []
    for m in papers_meta_list2[1::2]:
        link = m.find('a').get('href')
        tmp_base_url = base_url
        if not link.startswith('..'):
            base_parts = base_url.split('/')
            tmp_base_url = '/'.join(base_parts[:-2])
        pdf_urls_list.append(tmp_base_url + link)
    dates_list = [
        format_date(m.find('div', {'class': 'bibref'}))
        for m in papers_meta_list2[1::2]
    ]

    if (len(page_urls_list) == len(pdf_urls_list)
            and len(page_urls_list) == len(authors_list)
            and len(page_urls_list) == len(titles_list)):
        for i in tqdm(range(len(page_urls_list))):
            pid = db_manager.create_paper_id(conf_id, conf_sub_id, titles_list[i])
            if not db_manager.exists(pid):
                summary = get_abstract(page_urls_list[i])
                print(summary)
                db_manager.add_paper(conf_id, conf_sub_id,
                                     conf_sub_id.lower() != 'main', conf_name,
                                     titles_list[i], authors_list[i],
                                     page_urls_list[i], pdf_urls_list[i],
                                     dates_list[i], summary)
            else:
                print('Skipping {:} - exists'.format(page_urls_list[i]))
    else:
        print('SKIPPING!!! Wrong list sizes. ({:d}, {:d}, {:d}, {:d})'.format(
            len(page_urls_list), len(pdf_urls_list), len(authors_list),
            len(titles_list)))
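
# get_abstract() is assumed to be defined alongside this scraper.  A minimal
# sketch of what it might look like, assuming the paper page exposes the
# abstract inside a <div id="abstract"> (that selector is a guess, not taken
# from the repository):
def get_abstract(page_url: str) -> str:
    with urllib.request.urlopen(page_url) as url:
        response = url.read()
    soup = BeautifulSoup(response, 'html.parser')
    abstract_div = soup.find('div', {'id': 'abstract'})
    if abstract_div is None:
        return ''
    return flatten_content_list(abstract_div.contents)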

def fetch_papers(db_manager: DBManager, base_url: str, list_url: str,
                 conf_id: str, conf_sub_id: str, conf_name: str) -> None:
    """Fetches the data of all the papers found at list_url and adds them to
    the database, if the data is valid."""
    print(conf_name)
    print(conf_id, conf_sub_id)
    print(list_url)

    with urllib.request.urlopen(list_url) as url:
        response = url.read()
    soup = BeautifulSoup(response, 'html.parser')

    papers_meta_list = get_meta_list(soup)
    titles_list = [
        flatten_content_list(m.find('span', {'class': 'title'}).contents)
        for m in papers_meta_list
    ]
    authors_list = [
        format_authors(m.find('span', {'class': 'authors'}))
        for m in papers_meta_list
    ]
    page_urls_list = [
        base_url + m.find('a').get('href') for m in papers_meta_list
    ]
    conf_year = conf_id[-4:]
    conf_date = dateutil.parser.parse(conf_year + '-09')

    # print(papers_meta_list[0])
    # print(page_urls_list[0])
    # print(authors_list[0])
    # print(titles_list[0])

    if (len(titles_list) == len(authors_list)
            and len(titles_list) == len(page_urls_list)):
        for i, page_url in enumerate(tqdm(page_urls_list)):
            pid = db_manager.create_paper_id(conf_id, conf_sub_id, titles_list[i])
            if not db_manager.exists(pid):
                print(page_url)
                # On this site the PDF shares the paper's directory name.
                paper_id = page_url.split('/')[-2]
                pdf_url = page_url.replace('index.html', paper_id + '.pdf')
                summary = get_summary(page_url)
                print(summary)
                db_manager.add_paper(conf_id, conf_sub_id,
                                     conf_sub_id.lower() != 'main', conf_name,
                                     titles_list[i], authors_list[i],
                                     page_urls_list[i], pdf_url, conf_date,
                                     summary)
            else:
                print('Skipping {:} - Exists'.format(page_url))
    else:
        print('SKIPPING!!! Wrong list sizes. ({:d}, {:d}, {:d})'.format(
            len(page_urls_list), len(authors_list), len(titles_list)))
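
# flatten_content_list() is used throughout these scrapers to turn a
# BeautifulSoup .contents list (a mix of NavigableString and Tag nodes) into a
# single plain-text string.  The real helper lives elsewhere in the repo; this
# is only a sketch of the assumed behaviour.
def flatten_content_list(contents) -> str:
    parts = []
    for node in contents:
        # Tag nodes expose get_text(); NavigableString is already string-like.
        text = node.get_text() if hasattr(node, 'get_text') else str(node)
        parts.append(text.strip())
    return ' '.join(p for p in parts if p)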

def fetch_papers(db_manager: DBManager, base_url: str, list_url: str,
                 conf_id: str, conf_sub_id: str, conf_name: str) -> None:
    """Fetches the data of all the papers found at list_url and adds them to
    the database, if the data is valid."""
    print(conf_name)
    print(conf_id, conf_sub_id)
    print(list_url)

    with urllib.request.urlopen(list_url) as url:
        response = url.read()
    soup = BeautifulSoup(response, 'html.parser')

    papers_meta_list = soup.find_all('div', {'class': 'paper'})
    titles_list = [
        flatten_content_list(m.find('p', {'class': 'title'}).contents)
        for m in papers_meta_list
    ]
    authors_list = [
        format_authors(m.find('span', {'class': 'authors'}).string)
        for m in papers_meta_list
    ]
    page_urls_list = [m.find_all('a')[0].get('href') for m in papers_meta_list]
    pdf_urls_list = [m.find_all('a')[1].get('href') for m in papers_meta_list]
    conf_year = conf_id[-4:]
    conf_date = dateutil.parser.parse(conf_year + '-06')

    # print(papers_meta_list[0])
    # print(page_urls_list[0])
    # print(pdf_urls_list[0])
    # print(authors_list[0])
    # print(titles_list[0])

    if (len(page_urls_list) == len(authors_list)
            and len(page_urls_list) == len(pdf_urls_list)
            and len(page_urls_list) == len(titles_list)):
        for i, page_url in enumerate(tqdm(page_urls_list)):
            pid = db_manager.create_paper_id(conf_id, conf_sub_id, titles_list[i])
            if not db_manager.exists(pid):
                try:
                    print(page_url)
                    with urllib.request.urlopen(page_url) as url:
                        response2 = url.read()
                    soup2 = BeautifulSoup(response2, 'html.parser')
                    summary = flatten_content_list(
                        soup2.find('div', {'id': 'abstract'}).contents)
                    print(summary)
                    db_manager.add_paper(conf_id, conf_sub_id,
                                         conf_sub_id.lower() != 'main',
                                         conf_name, titles_list[i],
                                         authors_list[i], page_urls_list[i],
                                         pdf_urls_list[i], conf_date, summary)
                except urllib.error.URLError:
                    print('Skipping {:} - URLError'.format(page_url))
            else:
                print('Skipping {:} - Exists'.format(page_url))
    else:
        print('SKIPPING!!! Wrong list sizes. ({:d}, {:d}, {:d}, {:d})'.format(
            len(page_urls_list), len(pdf_urls_list), len(authors_list),
            len(titles_list)))
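
# Every scraper here simply skips a paper on urllib.error.URLError.  If
# transient network failures turn out to be a problem, a small retry helper
# like the one below could wrap the urlopen calls.  This is an optional
# addition, not part of the original code.
import time


def urlopen_with_retry(url: str, retries: int = 3, delay: float = 2.0) -> bytes:
    for attempt in range(retries):
        try:
            with urllib.request.urlopen(url) as resp:
                return resp.read()
        except urllib.error.URLError:
            if attempt == retries - 1:
                raise
            time.sleep(delay * (attempt + 1))  # simple linear backoff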

def fetch_papers(db_manager: DBManager, base_url: str, list_url: str,
                 conf_id: str, conf_sub_id: str, conf_name: str) -> None:
    """Fetches the data of all the papers found at list_url and adds them to
    the database, if the data is valid."""
    print(conf_name)
    print(conf_id, conf_sub_id)
    print(list_url)

    with urllib.request.urlopen(list_url) as url:
        response = url.read()
    soup = BeautifulSoup(response, 'html.parser')

    titles_meta_list = soup.find_all('dt')
    full_page_urls_list = [
        base_url + m.find('a').get('href') for m in titles_meta_list
    ]
    full_titles_list = [
        str(m.find('a').string).replace('\n', '') for m in titles_meta_list
    ]

    authors_meta_list = soup.find_all('dd')
    full_authors_list = [format_authors(m) for m in authors_meta_list[::2]]
    full_pdf_urls_list = [
        base_url + m.find('a').get('href') for m in authors_meta_list[1::2]
    ]
    full_pdf_urls_list = [
        m for m in full_pdf_urls_list
        if m.endswith('.pdf') and '-supp' not in m
    ]

    conf_year = conf_id[-4:]
    # Keep only the entries belonging to this year's edition.
    conf_mask = [
        'eccv_' + conf_year in p.lower() for p in full_page_urls_list
    ]
    page_urls_list = []
    titles_list = []
    authors_list = []
    pdf_urls_list = []
    for i in range(len(conf_mask)):
        if conf_mask[i]:
            page_urls_list.append(full_page_urls_list[i])
            titles_list.append(full_titles_list[i])
            authors_list.append(full_authors_list[i])
            pdf_urls_list.append(full_pdf_urls_list[i])
    dates_list = [
        dateutil.parser.parse('2020-08') for _ in range(len(page_urls_list))
    ]

    if (len(page_urls_list) == len(pdf_urls_list)
            and len(page_urls_list) == len(authors_list)
            and len(page_urls_list) == len(titles_list)):
        for i in tqdm(range(len(page_urls_list))):
            pid = db_manager.create_paper_id(conf_id, conf_sub_id, titles_list[i])
            if not db_manager.exists(pid):
                summary = get_abstract(page_urls_list[i])
                print(titles_list[i])
                print(summary)
                db_manager.add_paper(conf_id, conf_sub_id,
                                     conf_sub_id.lower() != 'main', conf_name,
                                     titles_list[i], authors_list[i],
                                     page_urls_list[i], pdf_urls_list[i],
                                     dates_list[i], summary)
            else:
                print('Skipping {:} - exists'.format(page_urls_list[i]))
    else:
        print('SKIPPING!!! Wrong list sizes. ({:d}, {:d}, {:d}, {:d})'.format(
            len(page_urls_list), len(pdf_urls_list), len(authors_list),
            len(titles_list)))
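
# An equivalent, slightly tidier way of applying conf_mask to the four parallel
# lists above, shown only as an illustration of the same filtering step; it is
# not used by the scraper itself.
def apply_mask(mask, *lists):
    return [[x for keep, x in zip(mask, lst) if keep] for lst in lists]

# page_urls_list, titles_list, authors_list, pdf_urls_list = apply_mask(
#     conf_mask, full_page_urls_list, full_titles_list, full_authors_list,
#     full_pdf_urls_list)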

def fetch_papers(db_manager: DBManager, list_url: str, conf_id: str,
                 conf_sub_id: str, conf_name: str) -> None:
    """Fetches the data of all the papers found at list_url and adds them to
    the database, if the data is valid."""
    print(conf_name)
    print(conf_id, conf_sub_id)
    print(list_url)

    conf_year = int(conf_id[-4:])
    papers_meta_list = get_meta_list(conf_id, conf_sub_id, list_url)

    if conf_year in [2020, 2021]:
        titles_list = [
            flatten_content_list(m.find_all('a')[0].contents)
            for m in papers_meta_list
        ]
        authors_list = [
            format_authors(m.find('div', {'class': 'note-authors'}).find_all('a'),
                           conf_year) for m in papers_meta_list
        ]
        page_urls_list = [
            m.find_all('a')[0].get('href') for m in papers_meta_list
        ]
        pdf_urls_list = [
            m.find_all('a')[1].get('href') for m in papers_meta_list
        ]
        summary_list = [
            extract_abstract(m, conf_year) for m in papers_meta_list
        ]
    elif conf_year in [2019, 2018]:
        # Drop entries that have no PDF link.
        for i in range(len(papers_meta_list) - 1, -1, -1):
            if papers_meta_list[i].find('a', {'title': 'PDF'}) is None:
                del papers_meta_list[i]
        titles_list = [
            flatten_content_list(m.find('div', {'class': 'maincardBody'}).contents)
            for m in papers_meta_list
        ]
        authors_list = [
            format_authors(m.find('div', {'class': 'maincardFooter'}).string,
                           conf_year) for m in papers_meta_list
        ]
        page_urls_list = [
            m.find('a', {'title': 'PDF'}).get('href') for m in papers_meta_list
        ]
    elif conf_year in [2017]:
        if conf_sub_id.lower() == 'main':
            page_urls_list = [
                'https://openreview.net' + m.find_all('a')[0].get('href')
                for m in papers_meta_list
            ]
        else:
            page_urls_list = [
                m.find_all('a')[0].get('href') for m in papers_meta_list
            ]
        titles_list = [
            flatten_content_list(m.find_all('a')[0].contents)
            for m in papers_meta_list
        ]
        authors_list = [
            format_authors(m.find('span', {'class': 'signatures'}).find_all('a'),
                           conf_year) for m in papers_meta_list
        ]
    elif conf_year in [2016, 2015]:
        page_urls_list = [
            m.find('a', {'class': 'urlextern'}).get('href')
            for m in papers_meta_list
        ]
        page_urls_list = [
            p.replace('http://beta.openreview', 'https://openreview')
            for p in page_urls_list
        ]
        titles_list = [
            flatten_content_list(m.find('a', {'class': 'urlextern'}).contents)
            for m in papers_meta_list
        ]
        authors_list = [
            format_authors(m.find('div', {'class': 'li'}).contents[-1],
                           conf_year) for m in papers_meta_list
        ]

    conf_date = dateutil.parser.parse(conf_id[-4:] + '-04')

    # print(papers_meta_list[0])
    # print(page_urls_list[0])
    # print(authors_list[0])
    # print(titles_list[0])

    if (len(page_urls_list) == len(authors_list)
            and len(page_urls_list) == len(titles_list)):
        for i, page_url in enumerate(tqdm(page_urls_list)):
            if len(page_url) < 2:
                continue
            pid = db_manager.create_paper_id(conf_id, conf_sub_id, titles_list[i])
            if not db_manager.exists(pid):
                try:
                    print(page_url)
                    if conf_year in [2020, 2021]:
                        pdf_url = pdf_urls_list[i]
                        summary = summary_list[i]
                    else:
                        with urllib.request.urlopen(page_url) as url:
                            response2 = url.read()
                        soup2 = BeautifulSoup(response2, 'html.parser')
                        try:
                            if conf_year in [2019, 2018, 2017] or (
                                    conf_year == 2016
                                    and conf_sub_id.lower() == 'workshop'):
                                pdf_url = 'https://openreview.net' + soup2.find(
                                    'a', {'class': 'note_content_pdf'}).get('href')
                                summary = flatten_content_list(
                                    soup2.find('span', {'class': 'note-content-value'}).contents)
                                if len(summary) < 10:
                                    summary = flatten_content_list(
                                        soup2.find_all(
                                            'span',
                                            {'class': 'note-content-value'})[1].contents)
                            elif conf_year in [2016, 2015]:
                                pdf_url = 'https://arxiv.org' + soup2.find(
                                    'a', {'accesskey': 'f'}).get('href')
                                summary = flatten_content_list(
                                    soup2.find('blockquote',
                                               {'class': 'abstract mathjax'}).contents[1:])
                        except AttributeError:
                            # Some links don't have a PDF.
                            continue
                    print(pdf_url)
                    print(summary)
                    db_manager.add_paper(conf_id, conf_sub_id,
                                         conf_sub_id.lower() != 'main',
                                         conf_name, titles_list[i],
                                         authors_list[i], page_urls_list[i],
                                         pdf_url, conf_date, summary)
                except urllib.error.URLError:
                    print('Skipping {:} - URLError'.format(page_url))
                    print(titles_list[i])
            else:
                print('Skipping {:} - Exists'.format(page_url))
    else:
        print('SKIPPING!!! Wrong list sizes. ({:d}, {:d}, {:d})'.format(
            len(page_urls_list), len(authors_list), len(titles_list)))
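
# The OpenReview-based variant above branches on the year encoded in conf_id.
# A hedged driver sketch for running it over several editions; the list URLs
# and venue identifiers are hypothetical placeholders, not taken from the
# repository's configuration.
def fetch_all_years(db_manager: DBManager) -> None:
    for year in [2021, 2020, 2019, 2018, 2017, 2016, 2015]:
        list_url = 'https://example.org/venue_{:d}_paper_list.html'.format(year)
        fetch_papers(db_manager, list_url, 'venue{:d}'.format(year), 'Main',
                     'VENUE')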

def fetch_papers(db_manager: DBManager, base_url: str, list_url: str,
                 conf_id: str, conf_sub_id: str, conf_name: str) -> None:
    """Fetches the data of all the papers found at list_url and adds them to
    the database, if the data is valid."""
    print(conf_name)
    print(conf_id, conf_sub_id)
    print(list_url)

    conf_year = int(conf_id[-4:])

    with urllib.request.urlopen(list_url) as url:
        response = url.read()
    soup = BeautifulSoup(response, 'html.parser')

    if conf_year in [2020]:
        contents_list = soup.find_all('a')
        issues_links = [c.get('href') for c in contents_list]
        issues_links = [
            i for i in issues_links
            if i.startswith('aaai20contents-issue') and '#' not in i
        ]
        issues_links = issues_links[:-1]  # Remove student papers
    else:
        issues_links = ['']

    for iss_link in issues_links:
        if len(iss_link) > 0:
            issue_url = base_url + 'AAAI/' + iss_link
            with urllib.request.urlopen(issue_url) as url:
                response = url.read()
            # with open('response_tmp2.pickle', 'wb') as f:
            #     pickle.dump(response, f)
            # with open('response_tmp2.pickle', 'rb') as f:
            #     response = pickle.load(f)
            soup = BeautifulSoup(response, 'html.parser')

        papers_meta_list = soup.find_all('p', {'class': 'left'})
        page_urls_list = [m.find('a').get('href') for m in papers_meta_list]
        authors_list = [m.find('i') for m in papers_meta_list]
        # None may happen because some authors are not inside <i>
        for i, a in enumerate(authors_list):
            if a is None:
                authors_list[i] = papers_meta_list[i].contents[-1].strip().replace('\n', '')
            else:
                authors_list[i] = a.string
        authors_list = [format_authors(a) for a in authors_list]
        titles_list = [
            flatten_content_list(m.find('a').contents) for m in papers_meta_list
        ]
        conf_date = dateutil.parser.parse(conf_id[-4:] + '-02')

        if (len(page_urls_list) == len(authors_list)
                and len(page_urls_list) == len(titles_list)):
            for i, page_url in enumerate(tqdm(page_urls_list)):
                pid = db_manager.create_paper_id(conf_id, conf_sub_id,
                                                 titles_list[i])
                if not db_manager.exists(pid):
                    try:
                        if conf_year not in [2020, 2019]:
                            page_url = page_url.replace('/view/', '/viewPaper/')
                        page_url = page_url.replace('http://', 'https://')
                        print(page_url)
                        with urllib.request.urlopen(page_url) as url:
                            response2 = url.read()
                        soup2 = BeautifulSoup(response2, 'html.parser')

                        if conf_year in [2020, 2019]:
                            try:
                                pdf_url = soup2.find(
                                    'a', {'class': 'obj_galley_link'}).get('href')
                            except AttributeError:
                                # This error happens when a paper does not have
                                # a correct page for some reason.
                                continue
                            summary = soup2.find('div', {'class': 'item abstract'})
                            if summary is None:
                                summary = ''
                            else:
                                summary = flatten_content_list(
                                    summary.find('p').contents)
                        elif conf_year in [2018, 2017, 2016, 2015]:
                            try:
                                pdf_url = soup2.find('div', {'id': 'paper'}).find('a').get('href')
                                summary = flatten_content_list(
                                    soup2.find('div', {'id': 'abstract'}).find('div').contents)
                            except AttributeError:
                                # Some links are wrong in the site
                                continue

                        print(pdf_url)
                        print(summary)
                        db_manager.add_paper(conf_id, conf_sub_id,
                                             conf_sub_id.lower() != 'main',
                                             conf_name, titles_list[i],
                                             authors_list[i], page_urls_list[i],
                                             pdf_url, conf_date, summary)
                    except urllib.error.URLError:
                        print('Skipping {:} - URLError'.format(page_url))
                else:
                    print('Skipping {:} - Exists'.format(page_url))
        else:
            print('SKIPPING!!! Wrong list sizes. ({:d}, {:d}, {:d})'.format(
                len(page_urls_list), len(authors_list), len(titles_list)))
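
# The scrapers only need three things from DBManager: create_paper_id(),
# exists() and add_paper().  A minimal in-memory stand-in with that interface,
# useful for dry runs; the real DBManager class lives elsewhere in the
# repository, and the add_paper argument order below is inferred from the
# calls above.
class InMemoryDBManager:
    def __init__(self):
        self._papers = {}

    def create_paper_id(self, conf_id: str, conf_sub_id: str, title: str) -> str:
        return '{}_{}_{}'.format(conf_id, conf_sub_id, title).lower()

    def exists(self, pid: str) -> bool:
        return pid in self._papers

    def add_paper(self, conf_id, conf_sub_id, is_workshop, conf_name, title,
                  authors, page_url, pdf_url, date, summary):
        pid = self.create_paper_id(conf_id, conf_sub_id, title)
        self._papers[pid] = {
            'conf_id': conf_id,
            'conf_sub_id': conf_sub_id,
            'is_workshop': is_workshop,
            'conf_name': conf_name,
            'title': title,
            'authors': authors,
            'page_url': page_url,
            'pdf_url': pdf_url,
            'date': date,
            'summary': summary,
        }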