def __download_from_springer(url,
                             save_dir,
                             year,
                             is_workshops=False,
                             time_sleep_in_seconds=5,
                             downloader='IDM'):
    downloader = Downloader(downloader)
    papers_dict = None
    for i in range(3):  # retry up to 3 times
        try:
            papers_dict = springer.get_paper_name_link_from_url(url)
            break
        except Exception as e:
            print(str(e))
    if papers_dict is None:
        raise RuntimeError(f'failed to fetch the paper list from {url} after 3 attempts')
    # total_paper_number = len(papers_dict)
    pbar = tqdm(papers_dict.keys())
    postfix = f'ECCV_{year}'
    if is_workshops:
        postfix = f'ECCV_WS_{year}'

    for name in pbar:
        pbar.set_description(f'Downloading paper {name}')
        if not os.path.exists(os.path.join(save_dir, f'{name}_{postfix}.pdf')):
            downloader.download(
                papers_dict[name],
                os.path.join(save_dir, f'{name}_{postfix}.pdf'),
                time_sleep_in_seconds)
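
A minimal usage sketch for the helper above, assuming the project's springer module and Downloader class are importable; the Springer URL and save directory are placeholders, not verified values.

# Usage sketch with placeholder arguments: download ECCV 2020 workshop papers via IDM.
__download_from_springer(
    url='https://link.springer.com/book/...',  # placeholder Springer proceedings URL
    save_dir=r'..\ECCV_2020_WS',
    year=2020,
    is_workshops=True,
    time_sleep_in_seconds=5,
    downloader='IDM')
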
Example #2
class Analytics:
    def __init__(self, config):
        self.config = config

        self.downloader = Downloader(config)
        self.parser = Parser(config)
        self.analyser = Analyser(config)

    def generate_report(self):
        if self.config.download:
            self.downloader.download()
        self.parser.parse()
        self.analyser.analyse()
        return self.config
Example #3
class Analytics:
    def __init__(self, config):
        self.config = config

        self.downloader = Downloader(config)
        self.parser = Parser(config)
        self.analyser = Analyser(config)

    def download_logs(self):
        print("here")
        if self.config.download:
            self.downloader.download()

    def generate_report(self):
        #if self.config.download:
        #    self.downloader.download()
        self.parser.parse()
        self.analyser.analyse()
        return self.config
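
A small usage sketch for the Analytics pipeline above; the config object is a hypothetical stand-in, since the real configuration class (and the Downloader, Parser and Analyser it feeds) is not shown in these examples.

# Usage sketch with a hypothetical config; assumes Downloader, Parser and Analyser are importable.
from types import SimpleNamespace

config = SimpleNamespace(download=True)  # stand-in for the project's config object
analytics = Analytics(config)
analytics.download_logs()      # only downloads when config.download is True
analytics.generate_report()    # parse + analyse, returns the config
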
def download_iclr_paper_given_html_file(year, html_path, save_dir, time_step_in_seconds=10, downloader='IDM'):
    """
    download iclr conference paper given html file (current only support 2021)
    :param year: int, iclr year, current only support year >= 2018
    :param html_path: str, html file's full pathname
    :param save_dir: str, paper save path
    :param time_step_in_seconds: int, the interval time between two downlaod request in seconds
    :param downloader: str, the downloader to download, could be 'IDM' or 'Thunder', default to 'IDM'
    :return: True
    """
    downloader = Downloader(downloader=downloader)
    base_url = f'https://openreview.net/group?id=ICLR.cc/{year}'
    with open(html_path, 'rb') as f:
        content = f.read()
    soup = BeautifulSoup(content, 'html5lib')
    divs = soup.find('div', {'class': 'tabs-container'})
    oral_papers = divs.find('div', {'id': 'oral-presentations'}).find_all('li', {'class': 'note'})
    num_oral_papers = len(oral_papers)
    print('found number of oral papers:', num_oral_papers)

    spotlight_papers = divs.find('div', {'id': 'spotlight-presentations'}).find_all('li', {'class': 'note'})
    num_spotlight_papers = len(spotlight_papers)
    print('found number of spotlight papers:', num_spotlight_papers)

    poster_papers = divs.find('div', {'id': 'poster-presentations'}).find_all('li', {'class': 'note'})
    num_poster_papers = len(poster_papers)
    print('found number of poster papers:', num_poster_papers)

    paper_postfix = f'ICLR_{year}'
    error_log = []

    # oral
    oral_save_dir = os.path.join(save_dir, 'oral')
    print('downloading oral papers...........')
    os.makedirs(oral_save_dir, exist_ok=True)
    for index, paper in enumerate(oral_papers):
        a_hrefs = paper.find_all("a")
        name = slugify(a_hrefs[0].text.strip())
        pdf_name = name + '_' + paper_postfix + '.pdf'
        if not os.path.exists(os.path.join(oral_save_dir, pdf_name)):
            link = a_hrefs[1].get('href')
            link = urllib.parse.urljoin(base_url, link)
            print('Downloading paper {}/{}: {}'.format(index + 1, num_oral_papers, name))
            # try only once
            success_flag = False
            for d_iter in range(1):
                try:
                    downloader.download(
                        urls=link,
                        save_path=os.path.join(oral_save_dir, pdf_name),
                        time_sleep_in_seconds=time_step_in_seconds
                        )
                    success_flag = True
                    break
                except Exception as e:
                    print('Error: ' + name + ' - ' + str(e))
                    # time.sleep(time_step_in_seconds)
            if not success_flag:
                error_log.append((name, link))

    # spotlight
    spotlight_save_dir = os.path.join(save_dir, 'spotlight')
    print('downloading spotlight papers...........')
    os.makedirs(spotlight_save_dir, exist_ok=True)
    for index, paper in enumerate(spotlight_papers):
        a_hrefs = paper.find_all("a")
        name = slugify(a_hrefs[0].text.strip())
        pdf_name = name + '_' + paper_postfix + '.pdf'
        if not os.path.exists(os.path.join(spotlight_save_dir, pdf_name)):
            link = a_hrefs[1].get('href')
            link = urllib.parse.urljoin(base_url, link)
            print('Downloading paper {}/{}: {}'.format(index + 1, num_spotlight_papers, name))
            # try only once
            success_flag = False
            for d_iter in range(1):
                try:
                    downloader.download(
                        urls=link,
                        save_path=os.path.join(spotlight_save_dir, pdf_name),
                        time_sleep_in_seconds=time_step_in_seconds
                    )
                    success_flag = True
                    break
                except Exception as e:
                    print('Error: ' + name + ' - ' + str(e))
                    # time.sleep(time_step_in_seconds)
            if not success_flag:
                error_log.append((name, link))

    # poster
    poster_save_dir = os.path.join(save_dir, 'poster')
    print('downloading poster papers...........')
    os.makedirs(poster_save_dir, exist_ok=True)
    for index, paper in enumerate(poster_papers):
        a_hrefs = paper.find_all("a")
        name = slugify(a_hrefs[0].text.strip())
        pdf_name = name + '_' + paper_postfix + '.pdf'
        if not os.path.exists(os.path.join(poster_save_dir, pdf_name)):
            link = a_hrefs[1].get('href')
            link = urllib.parse.urljoin(base_url, link)
            print('Downloading paper {}/{}: {}'.format(index + 1, num_poster_papers, name))
            # try only once
            success_flag = False
            for d_iter in range(1):
                try:
                    downloader.download(
                        urls=link,
                        save_path=os.path.join(poster_save_dir, pdf_name),
                        time_sleep_in_seconds=time_step_in_seconds
                    )
                    success_flag = True
                    break
                except Exception as e:
                    print('Error: ' + name + ' - ' + str(e))
                    # time.sleep(time_step_in_seconds)
            if not success_flag:
                error_log.append((name, link))



    # 2. write error log
    print('write error log')
    with open('..\\log\\download_err_log.txt', 'w') as f:
        for log in tqdm(error_log):
            for e in log:
                f.write(e)
                f.write('\n')
            f.write('\n')
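
A possible invocation of the function above; the HTML path points to a locally saved copy of the ICLR 2021 OpenReview page and, like the save directory, is a placeholder.

# Usage sketch (placeholder paths): parse a saved ICLR 2021 OpenReview page and download
# oral, spotlight and poster PDFs into separate subfolders via IDM.
download_iclr_paper_given_html_file(
    year=2021,
    html_path=r'..\html\ICLR_2021.html',  # placeholder path to the saved page
    save_dir=r'..\ICLR_2021',
    time_step_in_seconds=10,
    downloader='IDM')
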
def download_iclr_paper(year, save_dir, time_step_in_seconds=5, downloader='IDM', is_use_arxiv_mirror=False):
    """
    download iclr conference paper for year 2014, 2015 and 2016
    :param year: int, iclr year
    :param save_dir: str, paper save path
    :param time_step_in_seconds: int, the interval time between two downlaod request in seconds
    :param downloader: str, the downloader to download, could be 'IDM' or 'Thunder', default to 'IDM'
    :return: True
    """
    downloader = Downloader(downloader=downloader)
    paper_postfix = f'ICLR_{year}'
    if year == 2016:
        base_url = 'https://iclr.cc/archive/www/doku.php%3Fid=iclr2016:main.html'
    elif year == 2015:
        base_url = 'https://iclr.cc/archive/www/doku.php%3Fid=iclr2015:main.html'
    elif year == 2014:
        base_url = 'https://iclr.cc/archive/2014/conference-proceedings/'
    else:
        raise ValueError('the website url is not given for this year!')
    os.makedirs(save_dir, exist_ok=True)
    if year == 2015:  # oral and poster separated
        oral_save_path = os.path.join(save_dir, 'oral')
        poster_save_path = os.path.join(save_dir, 'poster')
        workshop_save_path = os.path.join(save_dir, 'ws')
        os.makedirs(oral_save_path, exist_ok=True)
        os.makedirs(poster_save_path, exist_ok=True)
        os.makedirs(workshop_save_path, exist_ok=True)
    if os.path.exists(f'..\\urls\\init_url_iclr_{year}.dat'):
        with open(f'..\\urls\\init_url_iclr_{year}.dat', 'rb') as f:
            content = pickle.load(f)
    else:
        headers = {
            'User-Agent':
                'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
        req = urllib.request.Request(url=base_url, headers=headers)
        content = urllib.request.urlopen(req).read()
        with open(f'..\\urls\\init_url_iclr_{year}.dat', 'wb') as f:
            pickle.dump(content, f)
    error_log = []
    soup = BeautifulSoup(content, 'html.parser')
    print('open url successfully!')
    if year == 2016:
        papers = soup.find('h3', {'id': 'accepted_papers_conference_track'}).findNext('div').find_all('a')
        for paper in tqdm(papers):
            link = paper.get('href')
            if link.startswith('http://arxiv'):
                title = slugify(paper.text)
                pdf_name = f'{title}_{paper_postfix}.pdf'
                try:
                    if not os.path.exists(os.path.join(save_dir, title+f'_{paper_postfix}.pdf')):
                        pdf_link = get_pdf_link_from_arxiv(link, is_use_mirror=is_use_arxiv_mirror)
                        print(f'downloading {title}')
                        downloader.download(
                                urls=pdf_link,
                                save_path=os.path.join(save_dir, pdf_name),
                                time_sleep_in_seconds=time_step_in_seconds
                            )
                except Exception as e:
                    # error_flag = True
                    print('Error: ' + title + ' - ' + str(e))
                    error_log.append((title, link, 'paper download error', str(e)))
        # workshops
        papers = soup.find('h3', {'id': 'workshop_track_posters_may_2nd'}).findNext('div').find_all('a')
        for paper in tqdm(papers):
            link = paper.get('href')
            if link.startswith('http://beta.openreview'):
                title = slugify(paper.text)
                pdf_name = f'{title}_ICLR_WS_{year}.pdf'
                try:
                    if not os.path.exists(os.path.join(save_dir, 'ws', pdf_name)):
                        pdf_link = get_pdf_link_from_openreview(link)
                        print(f'downloading {title}')
                        downloader.download(
                            urls=pdf_link,
                            save_path=os.path.join(save_dir, 'ws', pdf_name),
                            time_sleep_in_seconds=time_step_in_seconds
                        )
                except Exception as e:
                    # error_flag = True
                    print('Error: ' + title + ' - ' + str(e))
                    error_log.append((title, link, 'paper download error', str(e)))
        papers = soup.find('h3', {'id': 'workshop_track_posters_may_3rd'}).findNext('div').find_all('a')
        for paper in tqdm(papers):
            link = paper.get('href')
            if link.startswith('http://beta.openreview'):
                title = slugify(paper.text)
                pdf_name = f'{title}_ICLR_WS_{year}.pdf'
                try:
                    if not os.path.exists(os.path.join(save_dir, 'ws', pdf_name)):
                        pdf_link = get_pdf_link_from_openreview(link)
                        print(f'downloading {title}')
                        downloader.download(
                            urls=pdf_link,
                            save_path=os.path.join(save_dir, 'ws', pdf_name),
                            time_sleep_in_seconds=time_step_in_seconds
                        )
                except Exception as e:
                    # error_flag = True
                    print('Error: ' + title + ' - ' + str(e))
                    error_log.append((title, link, 'paper download error', str(e)))
    elif year == 2015:
        # oral papers
        oral_papers = soup.find('h3', {'id': 'conference_oral_presentations'}).findNext('div').find_all('a')
        for paper in tqdm(oral_papers):
            link = paper.get('href')
            if link.startswith('http://arxiv'):
                title = slugify(paper.text)
                pdf_name = f'{title}_{paper_postfix}.pdf'
                try:
                    if not os.path.exists(os.path.join(oral_save_path, title+f'_{paper_postfix}.pdf')):
                        pdf_link = get_pdf_link_from_arxiv(link, is_use_mirror=is_use_arxiv_mirror)
                        print(f'downloading {title}')
                        downloader.download(
                            urls=pdf_link,
                            save_path=os.path.join(oral_save_path, pdf_name),
                            time_sleep_in_seconds=time_step_in_seconds
                        )
                except Exception as e:
                    # error_flag = True
                    print('Error: ' + title + ' - ' + str(e))
                    error_log.append((title, link, 'paper download error', str(e)))

        # workshops papers
        workshop_papers = soup.find('h3', {'id': 'may_7_workshop_poster_session'}).findNext('div').find_all('a')
        workshop_papers.extend(  # extend, not append, so each paper is iterated individually
            soup.find('h3', {'id': 'may_8_workshop_poster_session'}).findNext('div').find_all('a'))
        for paper in tqdm(workshop_papers):
            link = paper.get('href')
            if link.startswith('http://arxiv'):
                title = slugify(paper.text)
                pdf_name = f'{title}_ICLR_WS_{year}.pdf'
                try:
                    if not os.path.exists(os.path.join(workshop_save_path, pdf_name)):
                        pdf_link = get_pdf_link_from_arxiv(link, is_use_mirror=is_use_arxiv_mirror)
                        print(f'downloading {title}')
                        downloader.download(
                            urls=pdf_link,
                            save_path=os.path.join(workshop_save_path, pdf_name),
                            time_sleep_in_seconds=time_step_in_seconds)
                except Exception as e:
                    # error_flag = True
                    print('Error: ' + title + ' - ' + str(e))
                    error_log.append((title, link, 'paper download error', str(e)))
        # poster papers
        poster_papers = soup.find('h3', {'id': 'may_9_conference_poster_session'}).findNext('div').find_all('a')
        for paper in tqdm(poster_papers):
            link = paper.get('href')
            if link.startswith('http://arxiv'):
                title = slugify(paper.text)
                pdf_name = f'{title}_{paper_postfix}.pdf'
                try:
                    if not os.path.exists(os.path.join(poster_save_path, title + f'_{paper_postfix}.pdf')):
                        pdf_link = get_pdf_link_from_arxiv(link, is_use_mirror=is_use_arxiv_mirror)
                        print(f'downloading {title}')
                        downloader.download(
                            urls=pdf_link,
                            save_path=os.path.join(poster_save_path, pdf_name),
                            time_sleep_in_seconds=time_step_in_seconds)
                except Exception as e:
                    # error_flag = True
                    print('Error: ' + title + ' - ' + str(e))
                    error_log.append((title, link, 'paper download error', str(e)))
    elif year == 2014:
        papers = soup.find('div', {'id': 'sites-canvas-main-content'}).find_all('a')
        for paper in tqdm(papers):
            link = paper.get('href')
            if link.startswith('http://arxiv'):
                title = slugify(paper.text)
                pdf_name = f'{title}_{paper_postfix}.pdf'
                try:
                    if not os.path.exists(os.path.join(save_dir, pdf_name)):
                        pdf_link = get_pdf_link_from_arxiv(link, is_use_mirror=is_use_arxiv_mirror)
                        print(f'downloading {title}')
                        downloader.download(
                            urls=pdf_link,
                            save_path=os.path.join(save_dir, pdf_name),
                            time_sleep_in_seconds=time_step_in_seconds)
                except Exception as e:
                    # error_flag = True
                    print('Error: ' + title + ' - ' + str(e))
                    error_log.append((title, link, 'paper download error', str(e)))

        # workshops
        paper_postfix = f'ICLR_WS_{year}'
        base_url = 'https://sites.google.com/site/representationlearning2014/workshop-proceedings'
        headers = {
            'User-Agent':
                'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
        req = urllib.request.Request(url=base_url, headers=headers)
        content = urllib.request.urlopen(req).read()
        soup = BeautifulSoup(content, 'html.parser')
        workshop_save_path = os.path.join(save_dir, 'WS')
        os.makedirs(workshop_save_path, exist_ok=True)
        papers = soup.find('div', {'id': 'sites-canvas-main-content'}).find_all('a')
        for paper in tqdm(papers):
            link = paper.get('href')
            if link.startswith('http://arxiv'):
                title = slugify(paper.text)
                pdf_name = f'{title}_{paper_postfix}.pdf'
                try:
                    if not os.path.exists(os.path.join(workshop_save_path, pdf_name)):
                        pdf_link = get_pdf_link_from_arxiv(link, is_use_mirror=is_use_arxiv_mirror)
                        print(f'downloading {title}')
                        downloader.download(
                            urls=pdf_link,
                            save_path=os.path.join(workshop_save_path, pdf_name),
                            time_sleep_in_seconds=time_step_in_seconds)
                except Exception as e:
                    # error_flag = True
                    print('Error: ' + title + ' - ' + str(e))
                    error_log.append((title, link, 'paper download error', str(e)))


    # write error log
    print('write error log')
    with open('..\\log\\download_err_log.txt', 'w') as f:
        for log in tqdm(error_log):
            for e in log:
                if e is not None:
                    f.write(e)
                else:
                    f.write('None')
                f.write('\n')

            f.write('\n')
    return True
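
A possible call for one of the supported years; the save directory is a placeholder, and the proceedings page is cached under ..\urls as in the code above.

# Usage sketch (placeholder save path): download ICLR 2015 oral, poster and workshop papers.
download_iclr_paper(
    year=2015,
    save_dir=r'..\ICLR_2015',
    time_step_in_seconds=5,
    downloader='IDM',
    is_use_arxiv_mirror=False)
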
def download_iclr_spotlight_papers(save_dir, driver_path, year, base_url=None, time_step_in_seconds=10, downloader='IDM'):
    """

    :param save_dir: str, paper save path
    :param driver_path: str, 'chromedriver.exe' full pathname
    :param year: int, iclr year, current only support year >= 2018
    :param base_url: str, paper website url
    :param time_step_in_seconds: int, the interval time between two downlaod request in seconds
    :param downloader: str, the downloader to download, could be 'IDM' or 'Thunder', default to 'IDM'
    :return:
    """
    downloader = Downloader(downloader=downloader)
    if base_url is None:
        if year >= 2021:
            base_url = 'https://openreview.net/group?id=ICLR.cc/2021/Conference#spotlight-presentations'
        elif year == 2020:
            base_url = 'https://openreview.net/group?id=ICLR.cc/2020/Conference#accept-spotlight'
        else:
            raise ValueError('the website url is not given for this year!')
    first_poster_index = {'2017': 15}
    paper_postfix = f'ICLR_{year}'
    error_log = []
    driver = webdriver.Chrome(driver_path)
    driver.get(base_url)

    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # wait for the select element to become visible
    print('Starting web driver wait...')
    wait = WebDriverWait(driver, 20)
    print('Starting web driver wait... finished')
    res = wait.until(EC.presence_of_element_located((By.ID, "notes")))
    print("Successful load the website!->",res)
    res = wait.until(EC.presence_of_element_located((By.CLASS_NAME, "note")))
    print("Successful load the website notes!->",res)
    # parse the results

    if year >= 2021:
        divs = driver.find_elements_by_xpath('//*[@id="spotlight-presentations"]/ul/li')
    elif year == 2020:
        divs = driver.find_elements_by_xpath('//*[@id="accept-spotlight"]/ul/li')
    else:
        divs = driver.find_elements_by_class_name('note')[:first_poster_index[str(year)]]
    num_papers = len(divs)
    print('found number of papers:',num_papers)
    for index, paper in enumerate(divs):
        a_hrefs = paper.find_elements_by_tag_name("a")
        if year >= 2018:
            name = slugify(a_hrefs[0].text.strip())
            link = a_hrefs[1].get_attribute('href')
        else:
            name = slugify(paper.find_element_by_class_name('note_content_title').text)
            link = paper.find_element_by_class_name('note_content_pdf').get_attribute('href')
        print('Downloading paper {}/{}: {}'.format(index+1, num_papers, name))
        pdf_name = name + '_' + paper_postfix + '.pdf'
        if not os.path.exists(os.path.join(save_dir, pdf_name)):
            # try only once
            success_flag = False
            for d_iter in range(1):
                try:
                    downloader.download(
                        urls=link,
                        save_path=os.path.join(save_dir, pdf_name),
                        time_sleep_in_seconds=time_step_in_seconds
                        )
                    success_flag = True
                    break
                except Exception as e:
                    print('Error: ' + name + ' - ' + str(e))
            if not success_flag:
                error_log.append((name, link))
    driver.close()
    # 2. write error log
    print('write error log')
    with open('..\\log\\download_err_log.txt', 'w') as f:
        for log in tqdm(error_log):
            for e in log:
                f.write(e)
                f.write('\n')
            f.write('\n')
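
A possible invocation of the Selenium-based scraper above; the chromedriver path and save directory are placeholders.

# Usage sketch (placeholder paths): scrape the ICLR 2021 spotlight list and download the PDFs via IDM.
download_iclr_spotlight_papers(
    save_dir=r'..\ICLR_2021\spotlight',
    driver_path=r'..\chromedriver.exe',  # placeholder path to chromedriver
    year=2021,
    time_step_in_seconds=10,
    downloader='IDM')
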
Example #7
def main(argv):
    config = configparser.ConfigParser()
    config_path = os.path.expanduser(os.path.join("~", ".doimgrrc"))
    if os.path.isfile(config_path):
        config.read(config_path)

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='Command line based tool to request DOI data and convert \
it to BibTex entries.')
    subparsers = parser.add_subparsers()

    parser_search = subparsers.add_parser(
        'search',
        help='Search database for published articles to find relevant DOIs',
        description="""Searches database for published articles. This can be
used to find a specific DOI or getting information about a keyword/topic.""")
    parser_search.add_argument('query', type=str, help='search string')
    parser_search.add_argument(
        '--show-authors',
        action='store_true',
        default=config.getboolean('search', 'show-authors', fallback=False),
        help='if set additional author information is shown')
    parser_search.add_argument(
        '--show-type',
        action='store_true',
        default=config.getboolean('search', 'show-type', fallback=False),
        help='if set additional information about the type is shown')
    parser_search.add_argument(
        '--show-publisher',
        action='store_true',
        default=config.getboolean('search', 'show-publisher', fallback=False),
        help='if set additional information about the publisher is shown')
    parser_search.add_argument('--show-url',
                               action='store_true',
                               default=config.getboolean('search',
                                                         'show-url',
                                                         fallback=False),
                               help='if set a URL to the document is shown')
    allowed_sort_types = [
        'score', 'updated', 'deposited', 'indexed', 'published'
    ]
    parser_search.add_argument('--sort', type=str, choices=allowed_sort_types,
        default=config.get('search', 'sort', fallback='score'),
        help='sorting of search queries; allowed values are {}'\
            .format(", ".join(allowed_sort_types)), metavar='')
    parser_search.add_argument('--order',
                               type=str,
                               choices=['asc', 'desc'],
                               default=config.get('search',
                                                  'order',
                                                  fallback='desc'),
                               help='ordering of search queries')
    parser_search.add_argument('--year',
                               type=int,
                               default=config.getint('search',
                                                     'year',
                                                     fallback=None),
                               help='limit the year')
    parser_search.add_argument('--rows',
                               type=int,
                               default=config.getint('search',
                                                     'rows',
                                                     fallback=20),
                               help='number of rows to load')
    parser_search.add_argument('--color',
                               action="store_true",
                               default=config.getboolean('search',
                                                         'color',
                                                         fallback=False),
                               help='if set, colored output is used')
    valid_colors = [
        'black', 'cyan', 'magenta', 'yellow', 'blue', 'green', 'red', 'white'
    ]
    parser_search.add_argument('--color-doi',
                               type=str,
                               default=config.get('search',
                                                  'color-doi',
                                                  fallback='red'),
                               choices=valid_colors,
                               help='color for DOIs')
    parser_search.add_argument('--color-title',
                               type=str,
                               default=config.get('search',
                                                  'color-title',
                                                  fallback='green'),
                               choices=valid_colors,
                               help='color for titles')
    parser_search.add_argument('--color-more',
                               type=str,
                               default=config.get('search',
                                                  'color-more',
                                                  fallback='blue'),
                               choices=valid_colors,
                               help='color for additional information such as \
authors, URLs, etc.')

    # receive allowed types via http://api.crossref.org/types
    allowed_types = api.get_valid_types()
    parser_search.add_argument('--type', type=str, choices=allowed_types,
        default=config.get('search', 'type', fallback=None),
        help='selects a single type; allowed values are {}'.format(", "\
            .join(allowed_types)),
        metavar='')
    parser_search.set_defaults(which_parser='search')

    parser_cite = subparsers.add_parser(
        'cite',
        help='Cite article based on DOI in different citation formats',
        description="""Cite articles with a known DOI. Formatting can be done
using the `style`-parameter and supports hundreds of different citation
formats. A full list of supported formats can be found in the subfolder
`API/styles.txt`. The most common ones are `apa` and `bibtex`.""")
    parser_cite.add_argument('identifier', type=str, help='DOI identifier')
    parser_cite.add_argument('-s',
                             '--style',
                             type=str,
                             default=config.get('cite',
                                                'style',
                                                fallback="bibtex"),
                             help='Citation style')
    parser_cite.add_argument(
        '-c',
        '--copy',
        action='store_true',
        default=config.get('cite', 'copy', fallback=False),
        help="""Copies the result to the system clipboard""")
    parser_cite.set_defaults(which_parser='cite')

    parser_download = subparsers.add_parser(
        'download',
        help='Download articles based on their DOI',
        description="""Downloads articles, if a full text verison is provided
by the authors.""")
    parser_download.add_argument('identifier', type=str, help='DOI identifier')
    parser_download.add_argument('-d',
                                 '--destination',
                                 type=str,
                                 default=config.get('download',
                                                    'destination',
                                                    fallback="."),
                                 help='download destination')
    parser_download.set_defaults(which_parser='download')

    parser_bulk = subparsers.add_parser(
        'bulk',
        help='Mass converting for multiple DOIs listed in a single file.',
        description=
        """Mass converting for multiple DOIs listed in a single file.""")
    parser_bulk.add_argument('input',
                             type=argparse.FileType('r'),
                             help='input file path',
                             nargs='?',
                             default=sys.stdin)
    parser_bulk.add_argument('output',
                             type=argparse.FileType('w'),
                             help='output file path',
                             nargs='?',
                             default=sys.stdout)
    parser_bulk.add_argument('-s',
                             '--style',
                             type=str,
                             default=config.get('bulk',
                                                'style',
                                                fallback="bibtex"),
                             help='Citation style')
    parser_bulk.set_defaults(which_parser='bulk')

    parser_service = subparsers.add_parser(
        'service',
        help='Provides service functions for the API such as rebuilding the \
database of valid types and styles',
        description="""Provices service functions for the API such as
rebuilding the database of valid types and styles""")
    parser_service.add_argument(
        '--rebuild-api-types',
        action='store_true',
        help='Rebuild the types, that are accepted on API requests')
    parser_service.add_argument(
        '--rebuild-api-styles',
        action='store_true',
        help='Rebuild the styles, that are accepted on API requests')
    parser_service.set_defaults(which_parser='service')

    parser.add_argument(
        '-q',
        '--quiet',
        action='store_true',
        default=config.getboolean('general', 'quiet', fallback=False),
        help='turns off all unnecessary outputs; use this for scripting')
    parser.add_argument('--log-level',
                        type=str,
                        choices=['info', 'debug'],
                        default=config.get('general',
                                           'log-level',
                                           fallback="info"),
                        help='set the logging level')
    parser.add_argument('--version',
                        action="store_true",
                        help='shows the version of doimgr')

    args = parser.parse_args()

    if args.version:
        print("doimgr version: {}".format(__version__))
        sys.exit()

    # set the logging levels according to the user's choice
    if args.quiet:
        level = logging.CRITICAL
    else:
        level = logging.INFO
        if args.log_level == 'debug':
            level = logging.DEBUG
    logging.basicConfig(level=level)

    logging.debug("doimgr version {}".format(__version__))

    if hasattr(args, 'which_parser'):
        if args.which_parser == 'search':
            logging.debug('Arguments match to perform search')
            req = Request()
            if sys.stdout.isatty():
                # only allow colors when the script's output is not redirected
                req.set_colored_output(args.color,
                                       doi=args.color_doi,
                                       title=args.color_title,
                                       more=args.color_more)
            else:
                logging.debug('Colors have been disabled due to detected \
output redirection')
            results = req.search(
                req.prepare_search_query(args.query, args.sort, args.order,
                                         args.year, args.type, args.rows))
            req.print_search_content(results, args.show_authors,
                                     args.show_type, args.show_publisher,
                                     args.show_url)

        elif args.which_parser == 'cite':
            logging.debug('Arguments match to request single DOI')

            # check if given style is valid
            # this is not done via argparse directly due to the amount of
            # possible parameters
            styles = api.get_valid_styles()
            if args.style not in styles:
                raise ValueError("Given style \"{}\" is not valid. \
    Aborting.".format(args.style))

            req = Request()
            result = req.citation(req.prepare_citation_query(args.identifier),
                                  style=args.style)
            req.print_citation(result)
            if args.copy:
                Clipboard.copy_to(result)

        elif args.which_parser == 'download':
            logging.debug('Arguments match to download single DOI')

            try:
                os.makedirs(os.path.expanduser(args.destination))
                logging.debug("Destination dir {} created.".format(
                    args.destination))
            except FileExistsError:
                logging.debug("Destination dir {} does already exists".format(
                    args.destination))

            req = Request()
            links = req.get_download_links(args.identifier)
            for link in links:
                url = link.get_url()
                d = Downloader()
                filepath = d.download(
                    url, os.path.expanduser(args.destination),
                    "{}.pdf".format(args.identifier.replace("/", "_")))
                if filepath is not None:
                    logging.info("Saved file as {}".format(filepath))

            if len(links) == 0:
                logging.info("No valid download URLs found. Aborting.")

        elif args.which_parser == 'bulk':
            logging.debug('Arguments match with bulk conversion')

            # check if given style is valid
            # this is not done via argparse directly due to the amount of
            # possible parameters
            styles = api.get_valid_styles()
            if args.style not in styles:
                raise ValueError("Given style \"{}\" is not valid. \
    Aborting.".format(args.style))

            b = BulkConverter()
            if args.output == sys.stdout:
                # switch to quiet mode, since we do not want to place
                # unnecessary messages on stdout
                logging.getLogger().setLevel(logging.CRITICAL)
            b.run(args.input, args.output, style=args.style)

        elif args.which_parser == 'service':
            logging.debug('Arguments match with service call')

            if args.rebuild_api_types:
                api.rebuild_valid_identifier(api.TYPE_TYPES)

            if args.rebuild_api_styles:
                api.rebuild_valid_identifier(api.TYPE_STYLES)
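
A sketch of how this CLI entry point might be invoked; the command lines in the comments use a placeholder DOI, and wiring main() to sys.argv is an assumption based on the argparse setup above.

# Usage sketch: run the doimgr CLI defined by main() above.
# e.g.  doimgr search "deep learning" --rows 5 --show-authors
#       doimgr cite 10.1000/xyz123 --style bibtex   (placeholder DOI)
import sys

if __name__ == '__main__':
    main(sys.argv[1:])
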
def download_paper(volumn,
                   save_dir,
                   time_step_in_seconds=5,
                   downloader='IDM',
                   url=None,
                   is_use_url=False):
    """
    download all JMLR paper files given volumn and restore in save_dir
    respectively
    :param volumn: int, JMLR volumn, such as 2019
    :param save_dir: str, paper and supplement material's save path
    :param time_step_in_seconds: int, the interval time between two downlaod request in seconds
    :param downloader: str, the downloader to download, could be 'IDM' or 'Thunder', default to 'IDM'
    :param url: None or str, None means to download volumn papers.
    :param is_use_url: bool, if to download papers from 'url'. url couldn't be None when is_use_url is True.
    :return: True
    """
    downloader = Downloader(downloader=downloader)
    # create current dict
    title_list = []
    # paper_dict = dict()

    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'
    }
    if not is_use_url:
        init_url = f'http://jmlr.org/papers/v{volumn}/'
        postfix = f'JMLR_v{volumn}'
        if os.path.exists(f'..\\urls\\init_url_JMLR_v{volumn}.dat'):
            with open(f'..\\urls\\init_url_JMLR_v{volumn}.dat', 'rb') as f:
                content = pickle.load(f)
        else:
            req = urllib.request.Request(url=init_url, headers=headers)
            content = urllib.request.urlopen(req, timeout=10).read()
            # content = open(f'..\\JMLR_{volumn}.html', 'rb').read()
            with open(f'..\\urls\\init_url_JMLR_v{volumn}.dat', 'wb') as f:
                pickle.dump(content, f)
    elif url is not None:
        req = urllib.request.Request(url=url, headers=headers)
        content = urllib.request.urlopen(req, timeout=10).read()
        postfix = f'JMLR'
    else:
        raise ValueError(
            ''''url' could not be None when 'is_use_url'=True!!!''')
    # soup = BeautifulSoup(content, 'html.parser')
    soup = BeautifulSoup(content, 'html5lib')
    # soup = BeautifulSoup(open(r'..\JMLR_2011.html', 'rb'), 'html.parser')
    error_log = []
    os.makedirs(save_dir, exist_ok=True)

    if (not is_use_url) and volumn <= 4:
        paper_list = soup.find('div', {'id': 'content'}).find_all('tr')
    else:
        paper_list = soup.find('div', {'id': 'content'}).find_all('dl')
    # num_download = 5 # number of papers to download
    num_download = len(paper_list)
    for paper in tqdm(zip(paper_list, range(num_download))):
        # get title
        print('\n')
        this_paper = paper[0]
        title = slugify(this_paper.find('dt').text)
        try:
            print('Downloading paper {}/{}: {}'.format(paper[1] + 1,
                                                       num_download, title))
        except:
            print(title.encode('utf8'))
        title_list.append(title)

        this_paper_main_path = os.path.join(
            save_dir, f'{title}_{postfix}.pdf'.replace(' ', '_'))
        if os.path.exists(this_paper_main_path):
            continue

        # get abstract page url
        links = this_paper.find_all('a')
        main_link = None
        for link in links:
            if '[pdf]' == link.text or 'pdf' == link.text:
                main_link = urllib.parse.urljoin('http://jmlr.org',
                                                 link.get('href'))
                break

        # try 1 time
        # error_flag = False
        for d_iter in range(1):
            try:
                # download paper with IDM
                if not os.path.exists(
                        this_paper_main_path) and main_link is not None:
                    downloader.download(
                        urls=main_link,
                        save_path=this_paper_main_path,
                        time_sleep_in_seconds=time_step_in_seconds)
            except Exception as e:
                # error_flag = True
                print('Error: ' + title + ' - ' + str(e))
                error_log.append(
                    (title, main_link, 'main paper download error', str(e)))

    # store the results
    # 1. store in the pickle file
    # with open(f'{postfix}_pre.dat', 'wb') as f:
    #     pickle.dump(paper_dict, f)

    # 2. write error log
    print('write error log')
    with open('..\\log\\download_err_log.txt', 'w') as f:
        for log in tqdm(error_log):
            for e in log:
                if e is not None:
                    f.write(e)
                else:
                    f.write('None')
                f.write('\n')

            f.write('\n')
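
A possible call to the JMLR downloader above; the volume number and save directory are placeholders.

# Usage sketch (placeholder values): download every paper of JMLR volume 20 via IDM,
# waiting 5 seconds between requests.
download_paper(
    volumn=20,                # placeholder volume number
    save_dir=r'..\JMLR_v20',
    time_step_in_seconds=5,
    downloader='IDM')
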
Example #9
def main(argv):
    config = configparser.ConfigParser()
    config_path = os.path.expanduser(os.path.join("~", ".doimgrrc"))
    if os.path.isfile(config_path):
        config.read(config_path)

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='Command line based tool to request DOI data and convert \
it to BibTex entries.')
    subparsers = parser.add_subparsers()

    parser_search = subparsers.add_parser('search', 
        help='Search database for published articles to find relevant DOIs',
        description="""Searches database for published articles. This can be
used to find a specific DOI or getting information about a keyword/topic.""")
    parser_search.add_argument('query', type=str, help='search string')
    parser_search.add_argument('--show-authors', action='store_true',
        default=config.getboolean('search', 'show-authors', fallback=False),
        help='if set additional author information is shown')
    parser_search.add_argument('--show-type', action='store_true',
        default=config.getboolean('search', 'show-type', fallback=False),
        help='if set additional information about the type is shown')
    parser_search.add_argument('--show-publisher', action='store_true',
        default=config.getboolean('search', 'show-publisher', fallback=False),
        help='if set additional information about the publisher is shown')
    parser_search.add_argument('--show-url', action='store_true',
        default=config.getboolean('search', 'show-url', fallback=False),
        help='if set a URL to the document is shown')
    allowed_sort_types=['score', 'updated', 'deposited', 'indexed', 'published']
    parser_search.add_argument('--sort', type=str, choices=allowed_sort_types,
        default=config.get('search', 'sort', fallback='score'),
        help='sorting of search queries; allowed values are {}'\
            .format(", ".join(allowed_sort_types)), metavar='')
    parser_search.add_argument('--order', type=str,
        choices=['asc', 'desc'],
        default=config.get('search', 'order', fallback='desc'),
        help='ordering of search queries')
    parser_search.add_argument('--year', type=int,
        default=config.getint('search', 'year', fallback=None),
        help='limit the year')
    parser_search.add_argument('--rows', type=int,
        default=config.getint('search', 'rows', fallback=20),
        help='number of rows to load')
    parser_search.add_argument('--color', action="store_true",
        default=config.getboolean('search', 'color', fallback=False),
        help='if set, colored output is used')
    valid_colors = ['black', 'cyan', 'magenta', 'yellow', 'blue', 'green',
            'red', 'white']
    parser_search.add_argument('--color-doi', type=str,
        default=config.get('search', 'color-doi', fallback='red'),
        choices=valid_colors, help='color for DOIs')
    parser_search.add_argument('--color-title', type=str,
        default=config.get('search', 'color-title', fallback='green'),
        choices=valid_colors, help='color for titles')
    parser_search.add_argument('--color-more', type=str,
        default=config.get('search', 'color-more', fallback='blue'),
        choices=valid_colors, help='color for additional information such as \
authors, URLs, etc.')

    # receive allowed types via http://api.crossref.org/types
    allowed_types = api.get_valid_types()
    parser_search.add_argument('--type', type=str, choices=allowed_types,
        default=config.get('search', 'type', fallback=None),
        help='selects a single type; allowed values are {}'.format(", "\
            .join(allowed_types)),
        metavar='')
    parser_search.set_defaults(which_parser='search')

    parser_cite = subparsers.add_parser('cite',
        help='Cite article based on DOI in different citation formats', 
        description="""Cite articles with a known DOI. Formatting can be done
using the `style`-parameter and supports hundreds of different citation
formats. A full list of supported formats can be found in the subfolder
`API/styles.txt`. The most common ones are `apa` and `bibtex`.""")
    parser_cite.add_argument('identifier', type=str, help='DOI identifier')
    parser_cite.add_argument('-s', '--style', type=str,
        default=config.get('cite', 'style', fallback="bibtex"),
        help='Citation style')
    parser_cite.add_argument('-c', '--copy', action='store_true',
        default=config.get('cite', 'copy', fallback=False),
        help="""Copies the result to the system clipboard""")
    parser_cite.set_defaults(which_parser='cite')

    parser_download = subparsers.add_parser('download',
        help='Download articles based on their DOI', 
        description="""Downloads articles, if a full text verison is provided
by the authors.""")
    parser_download.add_argument('identifier', type=str, help='DOI identifier')
    parser_download.add_argument('-d', '--destination', type=str,
        default=config.get('download', 'destination', fallback="."),
        help='download destination')
    parser_download.set_defaults(which_parser='download')

    parser_bulk = subparsers.add_parser('bulk',
        help='Mass converting for multiple DOIs listed in a single file.',
        description="""Mass converting for multiple DOIs listed in a single file.""")
    parser_bulk.add_argument('input', type=argparse.FileType('r'), 
        help='input file path', nargs='?', default=sys.stdin)
    parser_bulk.add_argument('output', type=argparse.FileType('w'),
        help='output file path', nargs='?', default=sys.stdout)
    parser_bulk.add_argument('-s', '--style', type=str,
        default=config.get('bulk', 'style', fallback="bibtex"),
        help='Citation style')
    parser_bulk.set_defaults(which_parser='bulk')

    parser_service = subparsers.add_parser('service',
        help='Provides service functions for the API such as rebuilding the \
database of valid types and styles', 
        description="""Provices service functions for the API such as
rebuilding the database of valid types and styles""")
    parser_service.add_argument('--rebuild-api-types', action='store_true',
            help='Rebuild the types, that are accepted on API requests')
    parser_service.add_argument('--rebuild-api-styles', action='store_true',
            help='Rebuild the styles, that are accepted on API requests')
    parser_service.set_defaults(which_parser='service')

    parser.add_argument('-q', '--quiet', action='store_true', 
        default=config.getboolean('general', 'quiet', fallback=False),
        help='turns off all unnecessary outputs; use this for scripting')
    parser.add_argument('--log-level', type=str, choices=['info', 'debug'],
        default=config.get('general', 'log-level', fallback="info"),
        help='set the logging level')
    parser.add_argument('--version', action="store_true",
        help='shows the version of doimgr')

    args = parser.parse_args()

    if args.version:
        print("doimgr version: {}".format(__version__))
        sys.exit()

    # set the logging levels according to the user's choice
    if args.quiet:
        level = logging.CRITICAL
    else:
        level = logging.INFO
        if args.log_level == 'debug':
            level = logging.DEBUG
    logging.basicConfig(level=level)

    logging.debug("doimgr version {}".format(__version__))

    if hasattr(args, 'which_parser'):
        if args.which_parser == 'search':
            logging.debug('Arguments match to perform search')
            req = Request()
            if sys.stdout.isatty():
                # only allow colors when the script's output is not redirected
                req.set_colored_output(args.color, doi=args.color_doi,
                        title=args.color_title, more=args.color_more)
            else:
                logging.debug('Colors have been disabled due to detected \
output redirection')
            results = req.search(req.prepare_search_query(args.query,
                args.sort, args.order, args.year, args.type, args.rows))
            req.print_search_content(results, args.show_authors,
                    args.show_type, args.show_publisher, args.show_url)

        elif args.which_parser == 'cite':
            logging.debug('Arguments match to request single DOI')

            # check if given style is valid
            # this is not done via argparse directly due to the amount of
            # possible parameters
            styles = api.get_valid_styles()
            if args.style not in styles:
                raise ValueError("Given style \"{}\" is not valid. \
    Aborting.".format(args.style))

            req = Request()
            result = req.citation(req.prepare_citation_query(args.identifier),
                    style=args.style)
            req.print_citation(result)
            if args.copy:
                Clipboard.copy_to(result)

        elif args.which_parser == 'download':
            logging.debug('Arguments match to download single DOI')

            try:
                os.makedirs(os.path.expanduser(args.destination))
                logging.debug("Destination dir {} created.".format(
                    args.destination))
            except FileExistsError:
                logging.debug("Destination dir {} does already exists".format(
                    args.destination))

            req = Request()
            links = req.get_download_links(args.identifier)
            for link in links:
                url = link.get_url()
                d = Downloader()
                filepath = d.download(url, 
                    os.path.expanduser(args.destination),
                    "{}.pdf".format(args.identifier.replace("/", "_")))
                if filepath is not None:
                    logging.info("Saved file as {}".format(filepath))

            if len(links) == 0:
                logging.info("No valid download URLs found. Aborting.")

        elif args.which_parser == 'bulk':
            logging.debug('Arguments match with bulk conversion')

            # check if given style is valid
            # this is not done via argparse directly due to the amount of
            # possible parameters
            styles = api.get_valid_styles()
            if args.style not in styles:
                raise ValueError("Given style \"{}\" is not valid. \
    Aborting.".format(args.style))

            b = BulkConverter()
            if args.output == sys.stdout:
                # switch to quiet mode, since we do not want to place
                # unnecessary messages on stdout
                logging.getLogger().setLevel(logging.CRITICAL)
            b.run(args.input, args.output, style=args.style)

        elif args.which_parser == 'service':
            logging.debug('Arguments match with service call')

            if args.rebuild_api_types:
                api.rebuild_valid_identifier(api.TYPE_TYPES)

            if args.rebuild_api_styles:
                api.rebuild_valid_identifier(api.TYPE_STYLES)
def download_from_csv(postfix,
                      save_dir,
                      csv_file_path,
                      is_download_main_paper=True,
                      is_download_supplement=True,
                      time_step_in_seconds=5,
                      total_paper_number=None,
                      downloader='IDM'):
    """
    download paper and supplement files and save them to save_dir/main_paper and save_dir/supplement
        respectively
    :param postfix: str, postfix that will be added at the end of papers' title
    :param save_dir: str, paper and supplement material's save path
    :param csv_file_path: str, the full path to csv file
    :param is_download_main_paper: bool, True for downloading main paper
    :param is_download_supplement: bool, True for downloading supplemental material
    :param time_step_in_seconds: int, the interval time between two downloading request in seconds
    :param total_paper_number: int, the total number of papers that is going to download
    :param downloader: str, the downloader to download, could be 'IDM' or 'Thunder', default to 'IDM'.
    :return: True
    """
    downloader = Downloader(downloader=downloader)
    if not os.path.exists(csv_file_path):
        raise ValueError(f'ERROR: file not found in {csv_file_path}!!!')

    main_save_path = os.path.join(save_dir, 'main_paper')
    if is_download_main_paper:
        os.makedirs(main_save_path, exist_ok=True)
    if is_download_supplement:
        supplement_save_path = os.path.join(save_dir, 'supplement')
        os.makedirs(supplement_save_path, exist_ok=True)

    error_log = []
    with open(csv_file_path, newline='') as csvfile:
        myreader = csv.DictReader(csvfile, delimiter=',')
        pbar = tqdm(myreader)
        i = 0
        for this_paper in pbar:
            is_grouped = ('group' in this_paper)
            i += 1
            # get title
            if is_grouped:
                group = slugify(this_paper['group'])
            title = slugify(this_paper['title'])
            if total_paper_number is not None:
                pbar.set_description(
                    f'Downloading paper {i}/{total_paper_number}')
            else:
                pbar.set_description(f'Downloading paper {i}')
            this_paper_main_path = os.path.join(main_save_path,
                                                f'{title}_{postfix}.pdf')
            if is_grouped:
                this_paper_main_path = os.path.join(main_save_path, group,
                                                    f'{title}_{postfix}.pdf')
            if is_download_supplement:
                this_paper_supp_path_no_ext = os.path.join(
                    supplement_save_path, f'{title}_{postfix}_supp.')
                if is_grouped:
                    this_paper_supp_path_no_ext = os.path.join(
                        supplement_save_path, group,
                        f'{title}_{postfix}_supp.')
                if '' != this_paper['supplemental link'] and os.path.exists(this_paper_main_path) and \
                        (os.path.exists(this_paper_supp_path_no_ext + 'zip') or os.path.exists(
                            this_paper_supp_path_no_ext + 'pdf')):
                    continue
                elif '' == this_paper['supplemental link'] and os.path.exists(
                        this_paper_main_path):
                    continue
            elif os.path.exists(this_paper_main_path):
                continue
            if 'error' == this_paper['main link']:
                error_log.append((title, 'no MAIN link'))
            elif '' != this_paper['main link']:
                if is_grouped:
                    if is_download_main_paper:
                        os.makedirs(os.path.join(main_save_path, group),
                                    exist_ok=True)
                    if is_download_supplement:
                        os.makedirs(os.path.join(supplement_save_path, group),
                                    exist_ok=True)
                if is_download_main_paper:
                    try:
                        # download paper with IDM
                        if not os.path.exists(this_paper_main_path):
                            downloader.download(
                                urls=this_paper['main link'].replace(
                                    ' ', '%20'),
                                save_path=os.path.join(os.getcwd(),
                                                       this_paper_main_path),
                                time_sleep_in_seconds=time_step_in_seconds)
                    except Exception as e:
                        # error_flag = True
                        print('Error: ' + title + ' - ' + str(e))
                        error_log.append((title, this_paper['main link'],
                                          'main paper download error', str(e)))
                # download supp
                if is_download_supplement:
                    # check whether the supp can be downloaded
                    if not (os.path.exists(this_paper_supp_path_no_ext + 'zip')
                            or os.path.exists(this_paper_supp_path_no_ext +
                                              'pdf')):
                        if 'error' == this_paper['supplemental link']:
                            error_log.append((title, 'no SUPPLEMENTAL link'))
                        elif '' != this_paper['supplemental link']:
                            supp_type = this_paper['supplemental link'].split(
                                '.')[-1]
                            try:
                                downloader.download(
                                    urls=this_paper['supplemental link'],
                                    save_path=os.path.join(
                                        os.getcwd(),
                                        this_paper_supp_path_no_ext +
                                        supp_type),
                                    time_sleep_in_seconds=time_step_in_seconds)
                            except Exception as e:
                                # error_flag = True
                                print('Error: ' + title + ' - ' + str(e))
                                error_log.append(
                                    (title, this_paper['supplemental link'],
                                     'supplement download error', str(e)))

        # write error log
        print('write error log')
        with open('..\\log\\download_err_log.txt', 'w') as f:
            for log in tqdm(error_log):
                for e in log:
                    if e is not None:
                        f.write(e)
                    else:
                        f.write('None')
                    f.write('\n')

                f.write('\n')

    return True
Example #11
def download_paper(year,
                   save_dir,
                   is_download_supplement=True,
                   time_step_in_seconds=5,
                   downloader='IDM'):
    """
    download all ICML papers and supplement files for the given year, saved to save_dir/main_paper and
    save_dir/supplement respectively
    :param year: int, ICML year, such as 2019
    :param save_dir: str, paper and supplement material's save path
    :param is_download_supplement: bool, True for downloading supplemental material
    :param time_step_in_seconds: int, the interval time between two download requests in seconds
    :param downloader: str, the downloader to use, could be 'IDM' or 'Thunder', defaults to 'IDM'
    :return: True
    A short usage sketch with assumed arguments follows the function definition.
    """
    downloader = Downloader(downloader=downloader)
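    # ICML year -> PMLR volume number (e.g. ICML 2019 is hosted at http://proceedings.mlr.press/v97/).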
    ICML_year_dict = {
        2021: 139,
        2020: 119,
        2019: 97,
        2018: 80,
        2017: 70,
        2016: 48,
        2015: 37,
        2014: 32,
        2013: 28
    }
    if year >= 2013:
        if year not in ICML_year_dict:
            raise ValueError('''the given year's url is unknown !''')
        init_url = f'http://proceedings.mlr.press/v{ICML_year_dict[year]}/'
    elif year == 2012:
        init_url = 'https://icml.cc/2012/papers.1.html'
    elif year == 2011:
        init_url = 'http://www.icml-2011.org/papers.php'
    elif 2009 == year:
        init_url = 'https://icml.cc/Conferences/2009/abstracts.html'
    elif 2008 == year:
        init_url = 'http://www.machinelearning.org/archive/icml2008/abstracts.shtml'
    elif 2007 == year:
        init_url = 'https://icml.cc/Conferences/2007/paperlist.html'
    elif year in [2006, 2004, 2005]:
        init_url = f'https://icml.cc/Conferences/{year}/proceedings.html'
    elif 2003 == year:
        init_url = 'https://aaai.org/Library/ICML/icml03contents.php'
    else:
        raise ValueError('''the given year's url is unknown !''')

    postfix = f'ICML_{year}'
    if os.path.exists(f'..\\urls\\init_url_icml_{year}.dat'):
        with open(f'..\\urls\\init_url_icml_{year}.dat', 'rb') as f:
            content = pickle.load(f)
    else:
        headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'
        }
        req = urllib.request.Request(url=init_url, headers=headers)
        content = urllib.request.urlopen(req).read()
        # content = open(f'..\\ICML_{year}.html', 'rb').read()
        with open(f'..\\urls\\init_url_icml_{year}.dat', 'wb') as f:
            pickle.dump(content, f)
    # soup = BeautifulSoup(content, 'html.parser')
    soup = BeautifulSoup(content, 'html5lib')
    # soup = BeautifulSoup(open(r'..\ICML_2011.html', 'rb'), 'html.parser')
    error_log = []
    if year >= 2013:
        if year in ICML_year_dict.keys():
            volume = f'v{ICML_year_dict[year]}'
        else:
            raise ValueError('''the given year's url is unknown !''')

        pmlr.download_paper_given_volume(
            volume=volume,
            save_dir=save_dir,
            postfix=postfix,
            is_download_supplement=is_download_supplement,
            time_step_in_seconds=time_step_in_seconds,
            downloader=downloader.downloader)
    elif 2012 == year:  # 2012
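        # ICML 2012: each paper sits in a <div class="paper">; the 'ICML version (pdf)' anchor holds the PDF link.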
        # base_url = f'https://icml.cc/{year}/'
        paper_list_bar = tqdm(soup.find_all('div', {'class': 'paper'}))
        paper_index = 0
        for paper in paper_list_bar:
            paper_index += 1
            title = slugify(paper.find('h2').text)
            link = None
            for a in paper.find_all('a'):
                if 'ICML version (pdf)' == a.text:
                    link = urllib.parse.urljoin(init_url, a.get('href'))
                    break
            if link is not None:
                this_paper_main_path = os.path.join(
                    save_dir, f'{title}_{postfix}.pdf'.replace(' ', '_'))
                paper_list_bar.set_description(
                    f'find paper {paper_index}:{title}')
                if not os.path.exists(this_paper_main_path):
                    paper_list_bar.set_description(
                        f'downloading paper {paper_index}:{title}')
                    downloader.download(
                        urls=link,
                        save_path=this_paper_main_path,
                        time_sleep_in_seconds=time_step_in_seconds)
            else:
                error_log.append((title, 'no main link error'))
    elif 2011 == year:
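        # ICML 2011: paper titles sit in <h3> tags inside anchors; anchors whose text slugifies to 'download' carry the PDF links.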
        paper_list_bar = tqdm(soup.find_all('a'))
        paper_index = 0
        title = None
        for paper in paper_list_bar:
            h3 = paper.find('h3')
            if h3 is not None:
                title = slugify(h3.text)
                paper_index += 1
            if 'download' == slugify(paper.text.strip()):
                link = paper.get('href')
                link = urllib.parse.urljoin(init_url, link)
                if link is not None and title is not None:
                    this_paper_main_path = os.path.join(
                        save_dir, f'{title}_{postfix}.pdf'.replace(' ', '_'))
                    paper_list_bar.set_description(
                        f'find paper {paper_index}:{title}')
                    if not os.path.exists(this_paper_main_path):
                        paper_list_bar.set_description(
                            f'downloading paper {paper_index}:{title}')
                        downloader.download(
                            urls=link,
                            save_path=this_paper_main_path,
                            time_sleep_in_seconds=time_step_in_seconds)
                else:
                    error_log.append((title, 'no main link error'))
    elif year in [2009, 2008]:
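        # ICML 2009/2008: titles are <h3> headings, each followed by an anchor whose text slugifies to 'full-paper'.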
        if 2009 == year:
            paper_list_bar = tqdm(
                soup.find('div', {
                    'id': 'right_column'
                }).find_all(['h3', 'a']))
        elif 2008 == year:
            paper_list_bar = tqdm(
                soup.find('div', {
                    'class': 'content'
                }).find_all(['h3', 'a']))
        paper_index = 0
        title = None
        for paper in paper_list_bar:
            if 'h3' == paper.name:
                title = slugify(paper.text)
                paper_index += 1
            elif 'full-paper' == slugify(paper.text.strip()):  # a
                link = paper.get('href')
                if link is not None and title is not None:
                    link = urllib.parse.urljoin(init_url, link)
                    this_paper_main_path = os.path.join(
                        save_dir, f'{title}_{postfix}.pdf')
                    paper_list_bar.set_description(
                        f'find paper {paper_index}:{title}')
                    if not os.path.exists(this_paper_main_path):
                        paper_list_bar.set_description(
                            f'downloading paper {paper_index}:{title}')
                        downloader.download(
                            urls=link,
                            save_path=this_paper_main_path,
                            time_sleep_in_seconds=time_step_in_seconds)
                    title = None
                else:
                    error_log.append((title, 'no main link error'))
    elif year in [2006, 2005]:
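        # ICML 2006/2005: the proceedings page links each paper's PDF (or PS) file directly; the anchor text is used as the title.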
        paper_list_bar = tqdm(soup.find_all('a'))
        paper_index = 0
        for paper in paper_list_bar:
            title = slugify(paper.text.strip())
            link = paper.get('href')
            paper_index += 1
            if link is not None and title is not None and (
                    'pdf' == link[-3:] or 'ps' == link[-2:]):
                link = urllib.parse.urljoin(init_url, link)
                this_paper_main_path = os.path.join(
                    save_dir, f'{title}_{postfix}.pdf'.replace(' ', '_'))
                paper_list_bar.set_description(
                    f'find paper {paper_index}:{title}')
                if not os.path.exists(this_paper_main_path):
                    paper_list_bar.set_description(
                        f'downloading paper {paper_index}:{title}')
                    downloader.download(
                        urls=link,
                        save_path=this_paper_main_path,
                        time_sleep_in_seconds=time_step_in_seconds)
    elif 2004 == year:
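        # ICML 2004: the proceedings table marks title rows with class 'proc_2004_title'; '[Paper]' anchors in the following rows point to the PDFs.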
        paper_index = 0
        paper_list_bar = tqdm(
            soup.find('table', {
                'class': 'proceedings'
            }).find_all('tr'))
        title = None
        for paper in paper_list_bar:
            tr_class = None
            try:
                tr_class = paper.get('class')[0]
            except:
                pass
            if 'proc_2004_title' == tr_class:  # title
                title = slugify(paper.text.strip())
                paper_index += 1
            else:
                for a in paper.find_all('a'):
                    if '[Paper]' == a.text:
                        link = a.get('href')
                        if link is not None and title is not None:
                            link = urllib.parse.urljoin(init_url, link)
                            this_paper_main_path = os.path.join(
                                save_dir,
                                f'{title}_{postfix}.pdf'.replace(' ', '_'))
                            paper_list_bar.set_description(
                                f'find paper {paper_index}:{title}')
                            if not os.path.exists(this_paper_main_path):
                                paper_list_bar.set_description(
                                    f'downloading paper {paper_index}:{title}')
                                downloader.download(
                                    urls=link,
                                    save_path=this_paper_main_path,
                                    time_sleep_in_seconds=time_step_in_seconds)
                        break
    elif 2003 == year:
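        # ICML 2003 (AAAI library): each paper entry links to an abstract page, which is fetched and searched for a PDF link.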
        paper_index = 0
        paper_list_bar = tqdm(
            soup.find('div', {
                'id': 'content'
            }).find_all('p', {'class': 'left'}))
        for paper in paper_list_bar:
            abs_link = None
            title = None
            link = None
            for a in paper.find_all('a'):
                abs_link = urllib.parse.urljoin(init_url, a.get('href'))
                if abs_link is not None:
                    title = slugify(a.text.strip())
                    break
            if title is not None:
                paper_index += 1
                this_paper_main_path = os.path.join(
                    save_dir, f'{title}_{postfix}.pdf'.replace(' ', '_'))
                paper_list_bar.set_description(
                    f'find paper {paper_index}:{title}')
                if not os.path.exists(this_paper_main_path):
                    if abs_link is not None:
                        headers = {
                            'User-Agent':
                            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'
                        }
                        req = urllib.request.Request(url=abs_link,
                                                     headers=headers)
                        abs_content = None
                        for i in range(3):
                            try:
                                abs_content = urllib.request.urlopen(
                                    req, timeout=10).read()
                                break
                            except Exception as e:
                                if i == 2:
                                    print('Error: ' + title + ' - ' + str(e))
                                    error_log.append(
                                        (title, abs_link, 'download error',
                                         str(e)))
                        if abs_content is None:
                            continue
                        abs_soup = BeautifulSoup(abs_content, 'html5lib')
                        for a in abs_soup.find_all('a'):
                            try:
                                if 'pdf' == a.get('href')[-3:]:
                                    link = urllib.parse.urljoin(
                                        abs_link, a.get('href'))
                                    if link is not None:
                                        paper_list_bar.set_description(
                                            f'downloading paper {paper_index}:{title}'
                                        )
                                        downloader.download(
                                            urls=link,
                                            save_path=this_paper_main_path,
                                            time_sleep_in_seconds=
                                            time_step_in_seconds)
                                    break
                            except:
                                pass

    # write error log
    print('write error log')
    with open('..\\log\\download_err_log.txt', 'w') as f:
        for log in tqdm(error_log):
            for e in log:
                if e is not None:
                    f.write(e)
                else:
                    f.write('None')
                f.write('\n')

            f.write('\n')
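
# A minimal usage sketch for download_paper above; the save directory is an
# assumed example path and is not part of the original code.
if __name__ == '__main__':
    download_paper(
        year=2019,
        save_dir='..\\ICML_2019',
        is_download_supplement=True,
        time_step_in_seconds=5,
        downloader='IDM')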
Example #12
def download_paper_given_volume(volume,
                                save_dir,
                                postfix,
                                is_download_supplement=True,
                                time_step_in_seconds=5,
                                downloader='IDM'):
    """
    download main and supplement papers from PMLR.
    :param volume: str, such as 'v1', 'r1'
    :param save_dir: str, paper and supplement material's save path
    :param postfix: str, the postfix that will be appended to the end of papers' titles
    :param is_download_supplement: bool, True for downloading supplemental material
    :param time_step_in_seconds: int, the interval time between two download requests in seconds
    :param downloader: str, the downloader to use, could be 'IDM' or 'Thunder', defaults to 'IDM'
    :return: True
    A short usage sketch with assumed arguments follows the function definition.
    """
    downloader = Downloader(downloader=downloader)
    init_url = f'http://proceedings.mlr.press/{volume}/'

    if is_download_supplement:
        main_save_path = os.path.join(save_dir, 'main_paper')
        supplement_save_path = os.path.join(save_dir, 'supplement')
        os.makedirs(main_save_path, exist_ok=True)
        os.makedirs(supplement_save_path, exist_ok=True)
    else:
        main_save_path = save_dir
        os.makedirs(main_save_path, exist_ok=True)
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'
    }
    req = urllib.request.Request(url=init_url, headers=headers)
    content = urllib.request.urlopen(req).read()
    soup = BeautifulSoup(content, 'html.parser')
    paper_list = soup.find_all('div', {'class': 'paper'})
    error_log = []
    title_list = []
    num_download = len(paper_list)
    pbar = tqdm(zip(paper_list, range(num_download)))
    for paper in pbar:
        # get title
        this_paper = paper[0]
        title = slugify(this_paper.find_all('p', {'class': 'title'})[0].text)
        try:
            pbar.set_description(
                f'Downloading paper {paper[1] + 1}/{num_download}: {title}')
        except:
            pbar.set_description(
                f'''Downloading paper {paper[1] + 1}/{num_download}: {title.encode('utf8')}'''
            )
        title_list.append(title)

        this_paper_main_path = os.path.join(main_save_path,
                                            f'{title}_{postfix}.pdf')
        if is_download_supplement:
            this_paper_supp_path = os.path.join(supplement_save_path,
                                                f'{title}_{postfix}_supp.pdf')
            this_paper_supp_path_no_ext = os.path.join(
                supplement_save_path, f'{title}_{postfix}_supp.')

            if os.path.exists(this_paper_main_path) and os.path.exists(
                    this_paper_supp_path):
                continue
        else:
            if os.path.exists(this_paper_main_path):
                continue

        # collect the main PDF link and (optionally) the supplementary link for this paper
        links = this_paper.find_all('p', {'class': 'links'})[0].find_all('a')
        supp_link = None
        main_link = None
        for link in links:
            if 'Download PDF' == link.text or 'pdf' == link.text:
                main_link = link.get('href')
            elif is_download_supplement and (
                    'Supplementary PDF' == link.text or 'Supplementary Material' == link.text or \
                    'supplementary' == link.text):
                supp_link = link.get('href')
                if supp_link[-3:] != 'pdf':
                    this_paper_supp_path = this_paper_supp_path_no_ext + supp_link[
                        -3:]

        # try 1 time
        # error_flag = False
        for d_iter in range(1):
            try:
                # download paper with IDM
                if not os.path.exists(
                        this_paper_main_path) and main_link is not None:
                    downloader.download(
                        urls=main_link,
                        save_path=this_paper_main_path,
                        time_sleep_in_seconds=time_step_in_seconds)
            except Exception as e:
                # error_flag = True
                print('Error: ' + title + ' - ' + str(e))
                error_log.append(
                    (title, main_link, 'main paper download error', str(e)))
            # download supp
            if is_download_supplement:
                # check whether the supp can be downloaded
                if not os.path.exists(
                        this_paper_supp_path) and supp_link is not None:
                    try:
                        downloader.download(
                            urls=supp_link,
                            save_path=this_paper_supp_path,
                            time_sleep_in_seconds=time_step_in_seconds)
                    except Exception as e:
                        # error_flag = True
                        print('Error: ' + title + ' - ' + str(e))
                        error_log.append(
                            (title, supp_link, 'supplement download error',
                             str(e)))

    # write error log
    print('writing error log...')
    with open('..\\log\\download_err_log.txt', 'w') as f:
        for log in tqdm(error_log):
            for e in log:
                if e is not None:
                    f.write(e)
                else:
                    f.write('None')
                f.write('\n')
            f.write('\n')

    return True
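
# A minimal usage sketch for download_paper_given_volume above; ICML 2019
# corresponds to PMLR volume v97, and the save directory is an assumed
# example path rather than part of the original code.
if __name__ == '__main__':
    download_paper_given_volume(
        volume='v97',
        save_dir='..\\ICML_2019',
        postfix='ICML_2019',
        is_download_supplement=True,
        time_step_in_seconds=5,
        downloader='IDM')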