def __download_from_springer(url,
                             save_dir,
                             year,
                             is_workshops=False,
                             time_sleep_in_seconds=5,
                             downloader='IDM'):
    downloader = Downloader(downloader)
    papers_dict = None
    for i in range(3):  # retry up to 3 times
        try:
            papers_dict = springer.get_paper_name_link_from_url(url)
            break
        except Exception as e:
            print(str(e))
    if papers_dict is None:
        raise RuntimeError(f'failed to fetch the paper list from {url} after 3 attempts')
    # total_paper_number = len(papers_dict)
    pbar = tqdm(papers_dict.keys())
    postfix = f'ECCV_{year}'
    if is_workshops:
        postfix = f'ECCV_WS_{year}'

    for name in pbar:
        pbar.set_description(f'Downloading paper {name}')
        if not os.path.exists(os.path.join(save_dir, f'{name}_{postfix}.pdf')):
            downloader.download(
                papers_dict[name],
                os.path.join(save_dir, f'{name}_{postfix}.pdf'),
                time_sleep_in_seconds)
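
A minimal usage sketch for the helper above, assuming the project's springer module and Downloader class are importable; the Springer URL and save directory are placeholders, not verified values.

# Usage sketch with placeholder arguments: download ECCV 2020 workshop papers via IDM.
__download_from_springer(
    url='https://link.springer.com/book/...',  # placeholder Springer proceedings URL
    save_dir=r'..\ECCV_2020_WS',
    year=2020,
    is_workshops=True,
    time_sleep_in_seconds=5,
    downloader='IDM')
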
Example #2
class Analytics:
    def __init__(self, config):
        self.config = config

        self.downloader = Downloader(config)
        self.parser = Parser(config)
        self.analyser = Analyser(config)

    def generate_report(self):
        if self.config.download:
            self.downloader.download()
        self.parser.parse()
        self.analyser.analyse()
        return self.config
Example #3
class Analytics:
    def __init__(self, config):
        self.config = config

        self.downloader = Downloader(config)
        self.parser = Parser(config)
        self.analyser = Analyser(config)

    def download_logs(self):
        print("here")
        if self.config.download:
            self.downloader.download()

    def generate_report(self):
        #if self.config.download:
        #    self.downloader.download()
        self.parser.parse()
        self.analyser.analyse()
        return self.config
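
A small usage sketch for the Analytics pipeline above; the config object is a hypothetical stand-in, since the real configuration class (and the Downloader, Parser and Analyser it feeds) is not shown in these examples.

# Usage sketch with a hypothetical config; assumes Downloader, Parser and Analyser are importable.
from types import SimpleNamespace

config = SimpleNamespace(download=True)  # stand-in for the project's config object
analytics = Analytics(config)
analytics.download_logs()      # only downloads when config.download is True
analytics.generate_report()    # parse + analyse, returns the config
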
def download_iclr_paper_given_html_file(year, html_path, save_dir, time_step_in_seconds=10, downloader='IDM'):
    """
    download iclr conference paper given html file (current only support 2021)
    :param year: int, iclr year, current only support year >= 2018
    :param html_path: str, html file's full pathname
    :param save_dir: str, paper save path
    :param time_step_in_seconds: int, the interval time between two downlaod request in seconds
    :param downloader: str, the downloader to download, could be 'IDM' or 'Thunder', default to 'IDM'
    :return: True
    """
    downloader = Downloader(downloader=downloader)
    base_url = f'https://openreview.net/group?id=ICLR.cc/{year}'
    with open(html_path, 'rb') as f:
        content = f.read()
    soup = BeautifulSoup(content, 'html5lib')
    divs = soup.find('div', {'class': 'tabs-container'})
    oral_papers = divs.find('div', {'id': 'oral-presentations'}).find_all('li', {'class': 'note'})
    num_oral_papers = len(oral_papers)
    print('found number of oral papers:', num_oral_papers)

    spotlight_papers = divs.find('div', {'id': 'spotlight-presentations'}).find_all('li', {'class': 'note'})
    num_spotlight_papers = len(spotlight_papers)
    print('found number of spotlight papers:', num_spotlight_papers)

    poster_papers = divs.find('div', {'id': 'poster-presentations'}).find_all('li', {'class': 'note'})
    num_poster_papers = len(poster_papers)
    print('found number of poster papers:', num_poster_papers)

    paper_postfix = f'ICLR_{year}'
    error_log = []

    # oral
    oral_save_dir = os.path.join(save_dir, 'oral')
    print('downloading oral papers...........')
    os.makedirs(oral_save_dir, exist_ok=True)
    for index, paper in enumerate(oral_papers):
        a_hrefs = paper.find_all("a")
        name = slugify(a_hrefs[0].text.strip())
        pdf_name = name + '_' + paper_postfix + '.pdf'
        if not os.path.exists(os.path.join(oral_save_dir, pdf_name)):
            link = a_hrefs[1].get('href')
            link = urllib.parse.urljoin(base_url, link)
            print('Downloading paper {}/{}: {}'.format(index + 1, num_oral_papers, name))
            # try only once
            success_flag = False
            for d_iter in range(1):
                try:
                    downloader.download(
                        urls=link,
                        save_path=os.path.join(oral_save_dir, pdf_name),
                        time_sleep_in_seconds=time_step_in_seconds
                        )
                    success_flag = True
                    break
                except Exception as e:
                    print('Error: ' + name + ' - ' + str(e))
                    # time.sleep(time_step_in_seconds)
            if not success_flag:
                error_log.append((name, link))

    # spotlight
    spotlight_save_dir = os.path.join(save_dir, 'spotlight')
    print('downloading spotlight papers...........')
    os.makedirs(spotlight_save_dir, exist_ok=True)
    for index, paper in enumerate(spotlight_papers):
        a_hrefs = paper.find_all("a")
        name = slugify(a_hrefs[0].text.strip())
        pdf_name = name + '_' + paper_postfix + '.pdf'
        if not os.path.exists(os.path.join(spotlight_save_dir, pdf_name)):
            link = a_hrefs[1].get('href')
            link = urllib.parse.urljoin(base_url, link)
            print('Downloading paper {}/{}: {}'.format(index + 1, num_spotlight_papers, name))
            # try only once
            success_flag = False
            for d_iter in range(1):
                try:
                    downloader.download(
                        urls=link,
                        save_path=os.path.join(spotlight_save_dir, pdf_name),
                        time_sleep_in_seconds=time_step_in_seconds
                    )
                    success_flag = True
                    break
                except Exception as e:
                    print('Error: ' + name + ' - ' + str(e))
                    # time.sleep(time_step_in_seconds)
            if not success_flag:
                error_log.append((name, link))

    # poster
    poster_save_dir = os.path.join(save_dir, 'poster')
    print('downloading poster papers...........')
    os.makedirs(poster_save_dir, exist_ok=True)
    for index, paper in enumerate(poster_papers):
        a_hrefs = paper.find_all("a")
        name = slugify(a_hrefs[0].text.strip())
        pdf_name = name + '_' + paper_postfix + '.pdf'
        if not os.path.exists(os.path.join(poster_save_dir, pdf_name)):
            link = a_hrefs[1].get('href')
            link = urllib.parse.urljoin(base_url, link)
            print('Downloading paper {}/{}: {}'.format(index + 1, num_poster_papers, name))
            # try only once
            success_flag = False
            for d_iter in range(1):
                try:
                    downloader.download(
                        urls=link,
                        save_path=os.path.join(poster_save_dir, pdf_name),
                        time_sleep_in_seconds=time_step_in_seconds
                    )
                    success_flag = True
                    break
                except Exception as e:
                    print('Error: ' + name + ' - ' + str(e))
                    # time.sleep(time_step_in_seconds)
            if not success_flag:
                error_log.append((name, link))



    # 2. write error log
    print('write error log')
    with open('..\\log\\download_err_log.txt', 'w') as f:
        for log in tqdm(error_log):
            for e in log:
                f.write(e)
                f.write('\n')
            f.write('\n')
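
A possible invocation of the function above; the HTML path points to a locally saved copy of the ICLR 2021 OpenReview page and, like the save directory, is a placeholder.

# Usage sketch (placeholder paths): parse a saved ICLR 2021 OpenReview page and download
# oral, spotlight and poster PDFs into separate subfolders via IDM.
download_iclr_paper_given_html_file(
    year=2021,
    html_path=r'..\html\ICLR_2021.html',  # placeholder path to the saved page
    save_dir=r'..\ICLR_2021',
    time_step_in_seconds=10,
    downloader='IDM')
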
def download_iclr_paper(year, save_dir, time_step_in_seconds=5, downloader='IDM', is_use_arxiv_mirror=False):
    """
    download iclr conference paper for year 2014, 2015 and 2016
    :param year: int, iclr year
    :param save_dir: str, paper save path
    :param time_step_in_seconds: int, the interval time between two downlaod request in seconds
    :param downloader: str, the downloader to download, could be 'IDM' or 'Thunder', default to 'IDM'
    :return: True
    """
    downloader = Downloader(downloader=downloader)
    paper_postfix = f'ICLR_{year}'
    if year == 2016:
        base_url = 'https://iclr.cc/archive/www/doku.php%3Fid=iclr2016:main.html'
    elif year == 2015:
        base_url = 'https://iclr.cc/archive/www/doku.php%3Fid=iclr2015:main.html'
    elif year == 2014:
        base_url = 'https://iclr.cc/archive/2014/conference-proceedings/'
    else:
        raise ValueError('the website url is not given for this year!')
    os.makedirs(save_dir, exist_ok=True)
    if year == 2015:  # oral and poster separated
        oral_save_path = os.path.join(save_dir, 'oral')
        poster_save_path = os.path.join(save_dir, 'poster')
        workshop_save_path = os.path.join(save_dir, 'ws')
        os.makedirs(oral_save_path, exist_ok=True)
        os.makedirs(poster_save_path, exist_ok=True)
        os.makedirs(workshop_save_path, exist_ok=True)
    if os.path.exists(f'..\\urls\\init_url_iclr_{year}.dat'):
        with open(f'..\\urls\\init_url_iclr_{year}.dat', 'rb') as f:
            content = pickle.load(f)
    else:
        headers = {
            'User-Agent':
                'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
        req = urllib.request.Request(url=base_url, headers=headers)
        content = urllib.request.urlopen(req).read()
        with open(f'..\\urls\\init_url_iclr_{year}.dat', 'wb') as f:
            pickle.dump(content, f)
    error_log = []
    soup = BeautifulSoup(content, 'html.parser')
    print('open url successfully!')
    if year == 2016:
        papers = soup.find('h3', {'id': 'accepted_papers_conference_track'}).findNext('div').find_all('a')
        for paper in tqdm(papers):
            link = paper.get('href')
            if link.startswith('http://arxiv'):
                title = slugify(paper.text)
                pdf_name = f'{title}_{paper_postfix}.pdf'
                try:
                    if not os.path.exists(os.path.join(save_dir, title+f'_{paper_postfix}.pdf')):
                        pdf_link = get_pdf_link_from_arxiv(link, is_use_mirror=is_use_arxiv_mirror)
                        print(f'downloading {title}')
                        downloader.download(
                                urls=pdf_link,
                                save_path=os.path.join(save_dir, pdf_name),
                                time_sleep_in_seconds=time_step_in_seconds
                            )
                except Exception as e:
                    # error_flag = True
                    print('Error: ' + title + ' - ' + str(e))
                    error_log.append((title, link, 'paper download error', str(e)))
        # workshops
        papers = soup.find('h3', {'id': 'workshop_track_posters_may_2nd'}).findNext('div').find_all('a')
        for paper in tqdm(papers):
            link = paper.get('href')
            if link.startswith('http://beta.openreview'):
                title = slugify(paper.text)
                pdf_name = f'{title}_ICLR_WS_{year}.pdf'
                try:
                    if not os.path.exists(os.path.join(save_dir, 'ws', pdf_name)):
                        pdf_link = get_pdf_link_from_openreview(link)
                        print(f'downloading {title}')
                        downloader.download(
                            urls=pdf_link,
                            save_path=os.path.join(save_dir, 'ws', pdf_name),
                            time_sleep_in_seconds=time_step_in_seconds
                        )
                except Exception as e:
                    # error_flag = True
                    print('Error: ' + title + ' - ' + str(e))
                    error_log.append((title, link, 'paper download error', str(e)))
        papers = soup.find('h3', {'id': 'workshop_track_posters_may_3rd'}).findNext('div').find_all('a')
        for paper in tqdm(papers):
            link = paper.get('href')
            if link.startswith('http://beta.openreview'):
                title = slugify(paper.text)
                pdf_name = f'{title}_ICLR_WS_{year}.pdf'
                try:
                    if not os.path.exists(os.path.join(save_dir, 'ws', pdf_name)):
                        pdf_link = get_pdf_link_from_openreview(link)
                        print(f'downloading {title}')
                        downloader.download(
                            urls=pdf_link,
                            save_path=os.path.join(save_dir, 'ws', pdf_name),
                            time_sleep_in_seconds=time_step_in_seconds
                        )
                except Exception as e:
                    # error_flag = True
                    print('Error: ' + title + ' - ' + str(e))
                    error_log.append((title, link, 'paper download error', str(e)))
    elif year == 2015:
        # oral papers
        oral_papers = soup.find('h3', {'id': 'conference_oral_presentations'}).findNext('div').find_all('a')
        for paper in tqdm(oral_papers):
            link = paper.get('href')
            if link.startswith('http://arxiv'):
                title = slugify(paper.text)
                pdf_name = f'{title}_{paper_postfix}.pdf'
                try:
                    if not os.path.exists(os.path.join(oral_save_path, title+f'_{paper_postfix}.pdf')):
                        pdf_link = get_pdf_link_from_arxiv(link, is_use_mirror=is_use_arxiv_mirror)
                        print(f'downloading {title}')
                        downloader.download(
                            urls=pdf_link,
                            save_path=os.path.join(oral_save_path, pdf_name),
                            time_sleep_in_seconds=time_step_in_seconds
                        )
                except Exception as e:
                    # error_flag = True
                    print('Error: ' + title + ' - ' + str(e))
                    error_log.append((title, link, 'paper download error', str(e)))

        # workshops papers
        workshop_papers = soup.find('h3', {'id': 'may_7_workshop_poster_session'}).findNext('div').find_all('a')
        workshop_papers.extend(  # extend, not append, so each paper is iterated individually
            soup.find('h3', {'id': 'may_8_workshop_poster_session'}).findNext('div').find_all('a'))
        for paper in tqdm(workshop_papers):
            link = paper.get('href')
            if link.startswith('http://arxiv'):
                title = slugify(paper.text)
                pdf_name = f'{title}_ICLR_WS_{year}.pdf'
                try:
                    if not os.path.exists(os.path.join(workshop_save_path, pdf_name)):
                        pdf_link = get_pdf_link_from_arxiv(link, is_use_mirror=is_use_arxiv_mirror)
                        print(f'downloading {title}')
                        downloader.download(
                            urls=pdf_link,
                            save_path=os.path.join(workshop_save_path, pdf_name),
                            time_sleep_in_seconds=time_step_in_seconds)
                except Exception as e:
                    # error_flag = True
                    print('Error: ' + title + ' - ' + str(e))
                    error_log.append((title, link, 'paper download error', str(e)))
        # poster papers
        poster_papers = soup.find('h3', {'id': 'may_9_conference_poster_session'}).findNext('div').find_all('a')
        for paper in tqdm(poster_papers):
            link = paper.get('href')
            if link.startswith('http://arxiv'):
                title = slugify(paper.text)
                pdf_name = f'{title}_{paper_postfix}.pdf'
                try:
                    if not os.path.exists(os.path.join(poster_save_path, title + f'_{paper_postfix}.pdf')):
                        pdf_link = get_pdf_link_from_arxiv(link, is_use_mirror=is_use_arxiv_mirror)
                        print(f'downloading {title}')
                        downloader.download(
                            urls=pdf_link,
                            save_path=os.path.join(poster_save_path, pdf_name),
                            time_sleep_in_seconds=time_step_in_seconds)
                except Exception as e:
                    # error_flag = True
                    print('Error: ' + title + ' - ' + str(e))
                    error_log.append((title, link, 'paper download error', str(e)))
    elif year == 2014:
        papers = soup.find('div', {'id': 'sites-canvas-main-content'}).find_all('a')
        for paper in tqdm(papers):
            link = paper.get('href')
            if link.startswith('http://arxiv'):
                title = slugify(paper.text)
                pdf_name = f'{title}_{paper_postfix}.pdf'
                try:
                    if not os.path.exists(os.path.join(save_dir, pdf_name)):
                        pdf_link = get_pdf_link_from_arxiv(link, is_use_mirror=is_use_arxiv_mirror)
                        print(f'downloading {title}')
                        downloader.download(
                            urls=pdf_link,
                            save_path=os.path.join(save_dir, pdf_name),
                            time_sleep_in_seconds=time_step_in_seconds)
                except Exception as e:
                    # error_flag = True
                    print('Error: ' + title + ' - ' + str(e))
                    error_log.append((title, link, 'paper download error', str(e)))

        # workshops
        paper_postfix = f'ICLR_WS_{year}'
        base_url = 'https://sites.google.com/site/representationlearning2014/workshop-proceedings'
        headers = {
            'User-Agent':
                'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
        req = urllib.request.Request(url=base_url, headers=headers)
        content = urllib.request.urlopen(req).read()
        soup = BeautifulSoup(content, 'html.parser')
        workshop_save_path = os.path.join(save_dir, 'WS')
        os.makedirs(workshop_save_path, exist_ok=True)
        papers = soup.find('div', {'id': 'sites-canvas-main-content'}).find_all('a')
        for paper in tqdm(papers):
            link = paper.get('href')
            if link.startswith('http://arxiv'):
                title = slugify(paper.text)
                pdf_name = f'{title}_{paper_postfix}.pdf'
                try:
                    if not os.path.exists(os.path.join(workshop_save_path, pdf_name)):
                        pdf_link = get_pdf_link_from_arxiv(link, is_use_mirror=is_use_arxiv_mirror)
                        print(f'downloading {title}')
                        downloader.download(
                            urls=pdf_link,
                            save_path=os.path.join(workshop_save_path, pdf_name),
                            time_sleep_in_seconds=time_step_in_seconds)
                except Exception as e:
                    # error_flag = True
                    print('Error: ' + title + ' - ' + str(e))
                    error_log.append((title, link, 'paper download error', str(e)))


    # write error log
    print('write error log')
    with open('..\\log\\download_err_log.txt', 'w') as f:
        for log in tqdm(error_log):
            for e in log:
                if e is not None:
                    f.write(e)
                else:
                    f.write('None')
                f.write('\n')

            f.write('\n')
    return True
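
A possible call for one of the supported years; the save directory is a placeholder, and the proceedings page is cached under ..\urls as in the code above.

# Usage sketch (placeholder save path): download ICLR 2015 oral, poster and workshop papers.
download_iclr_paper(
    year=2015,
    save_dir=r'..\ICLR_2015',
    time_step_in_seconds=5,
    downloader='IDM',
    is_use_arxiv_mirror=False)
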
def download_iclr_spotlight_papers(save_dir, driver_path, year, base_url=None, time_step_in_seconds=10, downloader='IDM'):
    """

    :param save_dir: str, paper save path
    :param driver_path: str, 'chromedriver.exe' full pathname
    :param year: int, iclr year, current only support year >= 2018
    :param base_url: str, paper website url
    :param time_step_in_seconds: int, the interval time between two downlaod request in seconds
    :param downloader: str, the downloader to download, could be 'IDM' or 'Thunder', default to 'IDM'
    :return:
    """
    downloader = Downloader(downloader=downloader)
    if base_url is None:
        if year >= 2021:
            base_url = 'https://openreview.net/group?id=ICLR.cc/2021/Conference#spotlight-presentations'
        elif year == 2020:
            base_url = 'https://openreview.net/group?id=ICLR.cc/2020/Conference#accept-spotlight'
        else:
            raise ValueError('the website url is not given for this year!')
    first_poster_index = {'2017': 15}
    paper_postfix = f'ICLR_{year}'
    error_log = []
    driver = webdriver.Chrome(driver_path)
    driver.get(base_url)

    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # wait for the select element to become visible
    print('Starting web driver wait...')
    wait = WebDriverWait(driver, 20)
    print('Starting web driver wait... finished')
    res = wait.until(EC.presence_of_element_located((By.ID, "notes")))
    print("Successful load the website!->",res)
    res = wait.until(EC.presence_of_element_located((By.CLASS_NAME, "note")))
    print("Successful load the website notes!->",res)
    # parse the results

    if year >= 2021:
        divs = driver.find_elements_by_xpath('//*[@id="spotlight-presentations"]/ul/li')
    elif year == 2020:
        divs = driver.find_elements_by_xpath('//*[@id="accept-spotlight"]/ul/li')
    else:
        divs = driver.find_elements_by_class_name('note')[:first_poster_index[str(year)]]
    num_papers = len(divs)
    print('found number of papers:',num_papers)
    for index, paper in enumerate(divs):
        a_hrefs = paper.find_elements_by_tag_name("a")
        if year >= 2018:
            name = slugify(a_hrefs[0].text.strip())
            link = a_hrefs[1].get_attribute('href')
        else:
            name = slugify(paper.find_element_by_class_name('note_content_title').text)
            link = paper.find_element_by_class_name('note_content_pdf').get_attribute('href')
        print('Downloading paper {}/{}: {}'.format(index+1, num_papers, name))
        pdf_name = name + '_' + paper_postfix + '.pdf'
        if not os.path.exists(os.path.join(save_dir, pdf_name)):
            # try only once
            success_flag = False
            for d_iter in range(1):
                try:
                    downloader.download(
                        urls=link,
                        save_path=os.path.join(save_dir, pdf_name),
                        time_sleep_in_seconds=time_step_in_seconds
                        )
                    success_flag = True
                    break
                except Exception as e:
                    print('Error: ' + name + ' - ' + str(e))
            if not success_flag:
                error_log.append((name, link))
    driver.close()
    # 2. write error log
    print('write error log')
    with open('..\\log\\download_err_log.txt', 'w') as f:
        for log in tqdm(error_log):
            for e in log:
                f.write(e)
                f.write('\n')
            f.write('\n')
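
A possible invocation of the Selenium-based scraper above; the chromedriver path and save directory are placeholders.

# Usage sketch (placeholder paths): scrape the ICLR 2021 spotlight list and download the PDFs via IDM.
download_iclr_spotlight_papers(
    save_dir=r'..\ICLR_2021\spotlight',
    driver_path=r'..\chromedriver.exe',  # placeholder path to chromedriver
    year=2021,
    time_step_in_seconds=10,
    downloader='IDM')
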
Example #7
def main(argv):
    config = configparser.ConfigParser()
    config_path = os.path.expanduser(os.path.join("~", ".doimgrrc"))
    if os.path.isfile(config_path):
        config.read(config_path)

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='Command line based tool to request DOI data and convert \
it to BibTex entries.')
    subparsers = parser.add_subparsers()

    parser_search = subparsers.add_parser(
        'search',
        help='Search database for published articles to find relevant DOIs',
        description="""Searches database for published articles. This can be
used to find a specific DOI or getting information about a keyword/topic.""")
    parser_search.add_argument('query', type=str, help='search string')
    parser_search.add_argument(
        '--show-authors',
        action='store_true',
        default=config.getboolean('search', 'show-authors', fallback=False),
        help='if set additional author information is shown')
    parser_search.add_argument(
        '--show-type',
        action='store_true',
        default=config.getboolean('search', 'show-type', fallback=False),
        help='if set additional information about the type is shown')
    parser_search.add_argument(
        '--show-publisher',
        action='store_true',
        default=config.getboolean('search', 'show-publisher', fallback=False),
        help='if set additional information about the publisher is shown')
    parser_search.add_argument('--show-url',
                               action='store_true',
                               default=config.getboolean('search',
                                                         'show-url',
                                                         fallback=False),
                               help='if set a URL to the document is shown')
    allowed_sort_types = [
        'score', 'updated', 'deposited', 'indexed', 'published'
    ]
    parser_search.add_argument('--sort', type=str, choices=allowed_sort_types,
        default=config.get('search', 'sort', fallback='score'),
        help='sorting of search queries; allowed values are {}'\
            .format(", ".join(allowed_sort_types)), metavar='')
    parser_search.add_argument('--order',
                               type=str,
                               choices=['asc', 'desc'],
                               default=config.get('search',
                                                  'order',
                                                  fallback='desc'),
                               help='ordering of search queries')
    parser_search.add_argument('--year',
                               type=int,
                               default=config.getint('search',
                                                     'year',
                                                     fallback=None),
                               help='limit the year')
    parser_search.add_argument('--rows',
                               type=int,
                               default=config.getint('search',
                                                     'rows',
                                                     fallback=20),
                               help='number of rows to load')
    parser_search.add_argument('--color',
                               action="store_true",
                               default=config.getboolean('search',
                                                         'color',
                                                         fallback=False),
                               help='if set, colored output is used')
    valid_colors = [
        'black', 'cyan', 'magenta', 'yellow', 'blue', 'green', 'red', 'white'
    ]
    parser_search.add_argument('--color-doi',
                               type=str,
                               default=config.get('search',
                                                  'color-doi',
                                                  fallback='red'),
                               choices=valid_colors,
                               help='color for DOIs')
    parser_search.add_argument('--color-title',
                               type=str,
                               default=config.get('search',
                                                  'color-title',
                                                  fallback='green'),
                               choices=valid_colors,
                               help='color for titles')
    parser_search.add_argument('--color-more',
                               type=str,
                               default=config.get('search',
                                                  'color-more',
                                                  fallback='blue'),
                               choices=valid_colors,
                               help='color for additional information such as \
authors, URLs, etc.')

    # receive allowed types via http://api.crossref.org/types
    allowed_types = api.get_valid_types()
    parser_search.add_argument('--type', type=str, choices=allowed_types,
        default=config.get('search', 'type', fallback=None),
        help='selects a single type; allowed values are {}'.format(", "\
            .join(allowed_types)),
        metavar='')
    parser_search.set_defaults(which_parser='search')

    parser_cite = subparsers.add_parser(
        'cite',
        help='Cite article based on DOI in different citation formats',
        description="""Cite articles with a known DOI. Formatting can be done
using the `style`-parameter and supports hundreds of different citation
formats. A full list of supported formats can be found in the subfolder
`API/styles.txt`. The most common ones are `apa` and `bibtex`.""")
    parser_cite.add_argument('identifier', type=str, help='DOI identifier')
    parser_cite.add_argument('-s',
                             '--style',
                             type=str,
                             default=config.get('cite',
                                                'style',
                                                fallback="bibtex"),
                             help='Citation style')
    parser_cite.add_argument(
        '-c',
        '--copy',
        action='store_true',
        default=config.get('cite', 'copy', fallback=False),
        help="""Copies the result to the system clipboard""")
    parser_cite.set_defaults(which_parser='cite')

    parser_download = subparsers.add_parser(
        'download',
        help='Download articles based on their DOI',
        description="""Downloads articles, if a full text verison is provided
by the authors.""")
    parser_download.add_argument('identifier', type=str, help='DOI identifier')
    parser_download.add_argument('-d',
                                 '--destination',
                                 type=str,
                                 default=config.get('download',
                                                    'destination',
                                                    fallback="."),
                                 help='download destination')
    parser_download.set_defaults(which_parser='download')

    parser_bulk = subparsers.add_parser(
        'bulk',
        help='Mass converting for multiple DOIs listed in a single file.',
        description=
        """Mass converting for multiple DOIs listed in a single file.""")
    parser_bulk.add_argument('input',
                             type=argparse.FileType('r'),
                             help='input file path',
                             nargs='?',
                             default=sys.stdin)
    parser_bulk.add_argument('output',
                             type=argparse.FileType('w'),
                             help='output file path',
                             nargs='?',
                             default=sys.stdout)
    parser_bulk.add_argument('-s',
                             '--style',
                             type=str,
                             default=config.get('bulk',
                                                'style',
                                                fallback="bibtex"),
                             help='Citation style')
    parser_bulk.set_defaults(which_parser='bulk')

    parser_service = subparsers.add_parser(
        'service',
        help='Provides service functions for the API such as rebuilding the \
database of valid types and styles',
        description="""Provices service functions for the API such as
rebuilding the database of valid types and styles""")
    parser_service.add_argument(
        '--rebuild-api-types',
        action='store_true',
        help='Rebuild the types, that are accepted on API requests')
    parser_service.add_argument(
        '--rebuild-api-styles',
        action='store_true',
        help='Rebuild the styles, that are accepted on API requests')
    parser_service.set_defaults(which_parser='service')

    parser.add_argument(
        '-q',
        '--quiet',
        action='store_true',
        default=config.getboolean('general', 'quiet', fallback=False),
        help='turns off all unnecessary outputs; use this for scripting')
    parser.add_argument('--log-level',
                        type=str,
                        choices=['info', 'debug'],
                        default=config.get('general',
                                           'log-level',
                                           fallback="info"),
                        help='set the logging level')
    parser.add_argument('--version',
                        action="store_true",
                        help='shows the version of doimgr')

    args = parser.parse_args()

    if args.version:
        print("doimgr version: {}".format(__version__))
        sys.exit()

    # set the logging levels according to the user's choice
    if args.quiet:
        level = logging.CRITICAL
    else:
        level = logging.INFO
        if args.log_level == 'debug':
            level = logging.DEBUG
    logging.basicConfig(level=level)

    logging.debug("doimgr version {}".format(__version__))

    if hasattr(args, 'which_parser'):
        if args.which_parser == 'search':
            logging.debug('Arguments match to perform search')
            req = Request()
            if sys.stdout.isatty():
                # only allow colors when the script's output is not redirected
                req.set_colored_output(args.color,
                                       doi=args.color_doi,
                                       title=args.color_title,
                                       more=args.color_more)
            else:
                logging.debug('Colors have been disabled due to detected \
output redirection')
            results = req.search(
                req.prepare_search_query(args.query, args.sort, args.order,
                                         args.year, args.type, args.rows))
            req.print_search_content(results, args.show_authors,
                                     args.show_type, args.show_publisher,
                                     args.show_url)

        elif args.which_parser == 'cite':
            logging.debug('Arguments match to request single DOI')

            # check if given style is valid
            # this is not done via argparse directly due to the amount of
            # possible parameters
            styles = api.get_valid_styles()
            if args.style not in styles:
                raise ValueError("Given style \"{}\" is not valid. \
    Aborting.".format(args.style))

            req = Request()
            result = req.citation(req.prepare_citation_query(args.identifier),
                                  style=args.style)
            req.print_citation(result)
            if args.copy:
                Clipboard.copy_to(result)

        elif args.which_parser == 'download':
            logging.debug('Arguments match to download single DOI')

            try:
                os.makedirs(os.path.expanduser(args.destination))
                logging.debug("Destination dir {} created.".format(
                    args.destination))
            except FileExistsError:
                logging.debug("Destination dir {} does already exists".format(
                    args.destination))

            req = Request()
            links = req.get_download_links(args.identifier)
            for link in links:
                url = link.get_url()
                d = Downloader()
                filepath = d.download(
                    url, os.path.expanduser(args.destination),
                    "{}.pdf".format(args.identifier.replace("/", "_")))
                if filepath is not None:
                    logging.info("Saved file as {}".format(filepath))

            if len(links) == 0:
                logging.info("No valid download URLs found. Aborting.")

        elif args.which_parser == 'bulk':
            logging.debug('Arguments match with bulk conversion')

            # check if given style is valid
            # this is not done via argparse directly due to the amount of
            # possible parameters
            styles = api.get_valid_styles()
            if args.style not in styles:
                raise ValueError("Given style \"{}\" is not valid. \
    Aborting.".format(args.style))

            b = BulkConverter()
            if args.output == sys.stdout:
                # switch to quiet mode, since we do not want to place
                # unnecessary messages on stdout
                logging.getLogger().setLevel(logging.CRITICAL)
            b.run(args.input, args.output, style=args.style)

        elif args.which_parser == 'service':
            logging.debug('Arguments match with service call')

            if args.rebuild_api_types:
                api.rebuild_valid_identifier(api.TYPE_TYPES)

            if args.rebuild_api_styles:
                api.rebuild_valid_identifier(api.TYPE_STYLES)
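
A sketch of how this CLI entry point might be invoked; the command lines in the comments use a placeholder DOI, and wiring main() to sys.argv is an assumption based on the argparse setup above.

# Usage sketch: run the doimgr CLI defined by main() above.
# e.g.  doimgr search "deep learning" --rows 5 --show-authors
#       doimgr cite 10.1000/xyz123 --style bibtex   (placeholder DOI)
import sys

if __name__ == '__main__':
    main(sys.argv[1:])
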
def download_paper(volumn,
                   save_dir,
                   time_step_in_seconds=5,
                   downloader='IDM',
                   url=None,
                   is_use_url=False):
    """
    download all JMLR paper files given volumn and restore in save_dir
    respectively
    :param volumn: int, JMLR volumn, such as 2019
    :param save_dir: str, paper and supplement material's save path
    :param time_step_in_seconds: int, the interval time between two downlaod request in seconds
    :param downloader: str, the downloader to download, could be 'IDM' or 'Thunder', default to 'IDM'
    :param url: None or str, None means to download volumn papers.
    :param is_use_url: bool, if to download papers from 'url'. url couldn't be None when is_use_url is True.
    :return: True
    """
    downloader = Downloader(downloader=downloader)
    # create current dict
    title_list = []
    # paper_dict = dict()

    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'
    }
    if not is_use_url:
        init_url = f'http://jmlr.org/papers/v{volumn}/'
        postfix = f'JMLR_v{volumn}'
        if os.path.exists(f'..\\urls\\init_url_JMLR_v{volumn}.dat'):
            with open(f'..\\urls\\init_url_JMLR_v{volumn}.dat', 'rb') as f:
                content = pickle.load(f)
        else:
            req = urllib.request.Request(url=init_url, headers=headers)
            content = urllib.request.urlopen(req, timeout=10).read()
            # content = open(f'..\\JMLR_{volumn}.html', 'rb').read()
            with open(f'..\\urls\\init_url_JMLR_v{volumn}.dat', 'wb') as f:
                pickle.dump(content, f)
    elif url is not None:
        req = urllib.request.Request(url=url, headers=headers)
        content = urllib.request.urlopen(req, timeout=10).read()
        postfix = f'JMLR'
    else:
        raise ValueError(
            ''''url' could not be None when 'is_use_url'=True!!!''')
    # soup = BeautifulSoup(content, 'html.parser')
    soup = BeautifulSoup(content, 'html5lib')
    # soup = BeautifulSoup(open(r'..\JMLR_2011.html', 'rb'), 'html.parser')
    error_log = []
    os.makedirs(save_dir, exist_ok=True)

    if (not is_use_url) and volumn <= 4:
        paper_list = soup.find('div', {'id': 'content'}).find_all('tr')
    else:
        paper_list = soup.find('div', {'id': 'content'}).find_all('dl')
    # num_download = 5 # number of papers to download
    num_download = len(paper_list)
    for paper in tqdm(zip(paper_list, range(num_download))):
        # get title
        print('\n')
        this_paper = paper[0]
        title = slugify(this_paper.find('dt').text)
        try:
            print('Downloading paper {}/{}: {}'.format(paper[1] + 1,
                                                       num_download, title))
        except:
            print(title.encode('utf8'))
        title_list.append(title)

        this_paper_main_path = os.path.join(
            save_dir, f'{title}_{postfix}.pdf'.replace(' ', '_'))
        if os.path.exists(this_paper_main_path):
            continue

        # get abstract page url
        links = this_paper.find_all('a')
        main_link = None
        for link in links:
            if '[pdf]' == link.text or 'pdf' == link.text:
                main_link = urllib.parse.urljoin('http://jmlr.org',
                                                 link.get('href'))
                break

        # try 1 time
        # error_flag = False
        for d_iter in range(1):
            try:
                # download paper with IDM
                if not os.path.exists(
                        this_paper_main_path) and main_link is not None:
                    downloader.download(
                        urls=main_link,
                        save_path=this_paper_main_path,
                        time_sleep_in_seconds=time_step_in_seconds)
            except Exception as e:
                # error_flag = True
                print('Error: ' + title + ' - ' + str(e))
                error_log.append(
                    (title, main_link, 'main paper download error', str(e)))

    # store the results
    # 1. store in the pickle file
    # with open(f'{postfix}_pre.dat', 'wb') as f:
    #     pickle.dump(paper_dict, f)

    # 2. write error log
    print('write error log')
    with open('..\\log\\download_err_log.txt', 'w') as f:
        for log in tqdm(error_log):
            for e in log:
                if e is not None:
                    f.write(e)
                else:
                    f.write('None')
                f.write('\n')

            f.write('\n')
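
A possible call to the JMLR downloader above; the volume number and save directory are placeholders.

# Usage sketch (placeholder values): download every paper of JMLR volume 20 via IDM,
# waiting 5 seconds between requests.
download_paper(
    volumn=20,                # placeholder volume number
    save_dir=r'..\JMLR_v20',
    time_step_in_seconds=5,
    downloader='IDM')
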
Example #9
def main(argv):
    config = configparser.ConfigParser()
    config_path = os.path.expanduser(os.path.join("~", ".doimgrrc"))
    if os.path.isfile(config_path):
        config.read(config_path)

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='Command line based tool to request DOI data and convert \
it to BibTex entries.')
    subparsers = parser.add_subparsers()

    parser_search = subparsers.add_parser('search', 
        help='Search database for published articles to find relevant DOIs',
        description="""Searches database for published articles. This can be
used to find a specific DOI or getting information about a keyword/topic.""")
    parser_search.add_argument('query', type=str, help='search string')
    parser_search.add_argument('--show-authors', action='store_true',
        default=config.getboolean('search', 'show-authors', fallback=False),
        help='if set additional author information is shown')
    parser_search.add_argument('--show-type', action='store_true',
        default=config.getboolean('search', 'show-type', fallback=False),
        help='if set additional information about the type is shown')
    parser_search.add_argument('--show-publisher', action='store_true',
        default=config.getboolean('search', 'show-publisher', fallback=False),
        help='if set additional information about the publisher is shown')
    parser_search.add_argument('--show-url', action='store_true',
        default=config.getboolean('search', 'show-url', fallback=False),
        help='if set a URL to the document is shown')
    allowed_sort_types=['score', 'updated', 'deposited', 'indexed', 'published']
    parser_search.add_argument('--sort', type=str, choices=allowed_sort_types,
        default=config.get('search', 'sort', fallback='score'),
        help='sorting of search queries; allowed values are {}'\
            .format(", ".join(allowed_sort_types)), metavar='')
    parser_search.add_argument('--order', type=str,
        choices=['asc', 'desc'],
        default=config.get('search', 'order', fallback='desc'),
        help='ordering of search queries')
    parser_search.add_argument('--year', type=int,
        default=config.getint('search', 'year', fallback=None),
        help='limit the year')
    parser_search.add_argument('--rows', type=int,
        default=config.getint('search', 'rows', fallback=20),
        help='number of rows to load')
    parser_search.add_argument('--color', action="store_true",
        default=config.getboolean('search', 'color', fallback=False),
        help='if set, colored output is used')
    valid_colors = ['black', 'cyan', 'magenta', 'yellow', 'blue', 'green',
            'red', 'white']
    parser_search.add_argument('--color-doi', type=str,
        default=config.get('search', 'color-doi', fallback='red'),
        choices=valid_colors, help='color for DOIs')
    parser_search.add_argument('--color-title', type=str,
        default=config.get('search', 'color-title', fallback='green'),
        choices=valid_colors, help='color for titles')
    parser_search.add_argument('--color-more', type=str,
        default=config.get('search', 'color-more', fallback='blue'),
        choices=valid_colors, help='color for additional information such as \
authors, URLs, etc.')

    # receive allowed types via http://api.crossref.org/types
    allowed_types = api.get_valid_types()
    parser_search.add_argument('--type', type=str, choices=allowed_types,
        default=config.get('search', 'type', fallback=None),
        help='selects a single type; allowed values are {}'.format(", "\
            .join(allowed_types)),
        metavar='')
    parser_search.set_defaults(which_parser='search')

    parser_cite = subparsers.add_parser('cite',
        help='Cite article based on DOI in different citation formats', 
        description="""Cite articles with a known DOI. Formatting can be done
using the `style`-parameter and supports hundreds of different citation
formats. A full list of supported formats can be found in the subfolder
`API/styles.txt`. The most common ones are `apa` and `bibtex`.""")
    parser_cite.add_argument('identifier', type=str, help='DOI identifier')
    parser_cite.add_argument('-s', '--style', type=str,
        default=config.get('cite', 'style', fallback="bibtex"),
        help='Citation style')
    parser_cite.add_argument('-c', '--copy', action='store_true',
        default=config.get('cite', 'copy', fallback=False),
        help="""Copies the result to the system clipboard""")
    parser_cite.set_defaults(which_parser='cite')

    parser_download = subparsers.add_parser('download',
        help='Download articles based on their DOI', 
        description="""Downloads articles, if a full text verison is provided
by the authors.""")
    parser_download.add_argument('identifier', type=str, help='DOI identifier')
    parser_download.add_argument('-d', '--destination', type=str,
        default=config.get('download', 'destination', fallback="."),
        help='download destination')
    parser_download.set_defaults(which_parser='download')

    parser_bulk = subparsers.add_parser('bulk',
        help='Mass converting for multiple DOIs listed in a single file.',
        description="""Mass converting for multiple DOIs listed in a single file.""")
    parser_bulk.add_argument('input', type=argparse.FileType('r'), 
        help='input file path', nargs='?', default=sys.stdin)
    parser_bulk.add_argument('output', type=argparse.FileType('w'),
        help='output file path', nargs='?', default=sys.stdout)
    parser_bulk.add_argument('-s', '--style', type=str,
        default=config.get('bulk', 'style', fallback="bibtex"),
        help='Citation style')
    parser_bulk.set_defaults(which_parser='bulk')

    parser_service = subparsers.add_parser('service',
        help='Provides service functions for the API such as rebuilding the \
database of valid types and styles', 
        description="""Provices service functions for the API such as
rebuilding the database of valid types and styles""")
    parser_service.add_argument('--rebuild-api-types', action='store_true',
            help='Rebuild the types, that are accepted on API requests')
    parser_service.add_argument('--rebuild-api-styles', action='store_true',
            help='Rebuild the styles, that are accepted on API requests')
    parser_service.set_defaults(which_parser='service')

    parser.add_argument('-q', '--quiet', action='store_true', 
        default=config.getboolean('general', 'quiet', fallback=False),
        help='turns off all unnecessary outputs; use this for scripting')
    parser.add_argument('--log-level', type=str, choices=['info', 'debug'],
        default=config.get('general', 'log-level', fallback="info"),
        help='set the logging level')
    parser.add_argument('--version', action="store_true",
        help='shows the version of doimgr')

    args = parser.parse_args()

    if args.version:
        print("doimgr version: {}".format(__version__))
        sys.exit()

    # set the logging levels according to the user's choice
    if args.quiet:
        level = logging.CRITICAL
    else:
        level = logging.INFO
        if args.log_level == 'debug':
            level = logging.DEBUG
    logging.basicConfig(level=level)

    logging.debug("doimgr version {}".format(__version__))

    if hasattr(args, 'which_parser'):
        if args.which_parser == 'search':
            logging.debug('Arguments match to perform search')
            req = Request()
            if sys.stdout.isatty():
                # only allow colors when the script's output is not redirected
                req.set_colored_output(args.color, doi=args.color_doi,
                        title=args.color_title, more=args.color_more)
            else:
                logging.debug('Colors have been disabled due to detected \
output redirection')
            results = req.search(req.prepare_search_query(args.query,
                args.sort, args.order, args.year, args.type, args.rows))
            req.print_search_content(results, args.show_authors,
                    args.show_type, args.show_publisher, args.show_url)

        elif args.which_parser == 'cite':
            logging.debug('Arguments match to request single DOI')

            # check if given style is valid
            # this is not done via argparse directly due to the amount of
            # possible parameters
            styles = api.get_valid_styles()
            if args.style not in styles:
                raise ValueError("Given style \"{}\" is not valid. \
    Aborting.".format(args.style))

            req = Request()
            result = req.citation(req.prepare_citation_query(args.identifier),
                    style=args.style)
            req.print_citation(result)
            if args.copy:
                Clipboard.copy_to(result)

        elif args.which_parser == 'download':
            logging.debug('Arguments match to download single DOI')

            try:
                os.makedirs(os.path.expanduser(args.destination))
                logging.debug("Destination dir {} created.".format(
                    args.destination))
            except FileExistsError:
                logging.debug("Destination dir {} does already exists".format(
                    args.destination))

            req = Request()
            links = req.get_download_links(args.identifier)
            for link in links:
                url = link.get_url()
                d = Downloader()
                filepath = d.download(url, 
                    os.path.expanduser(args.destination),
                    "{}.pdf".format(args.identifier.replace("/", "_")))
                if filepath is not None:
                    logging.info("Saved file as {}".format(filepath))

            if len(links) == 0:
                logging.info("No valid download URLs found. Aborting.")

        elif args.which_parser == 'bulk':
            logging.debug('Arguments match with bulk conversion')

            # check if given style is valid
            # this is not done via argparse directly due to the amount of
            # possible parameters
            styles = api.get_valid_styles()
            if args.style not in styles:
                raise ValueError("Given style \"{}\" is not valid. \
    Aborting.".format(args.style))

            b = BulkConverter()
            if args.output == sys.stdout:
                # switch to quiet mode, since we do not want to place
                # unnecessary messages on stdout
                logging.getLogger().setLevel(logging.CRITICAL)
            b.run(args.input, args.output, style=args.style)

        elif args.which_parser == 'service':
            logging.debug('Arguments match with service call')

            if args.rebuild_api_types:
                api.rebuild_valid_identifier(api.TYPE_TYPES)

            if args.rebuild_api_styles:
                api.rebuild_valid_identifier(api.TYPE_STYLES)
def download_from_csv(postfix,
                      save_dir,
                      csv_file_path,
                      is_download_main_paper=True,
                      is_download_supplement=True,
                      time_step_in_seconds=5,
                      total_paper_number=None,
                      downloader='IDM'):
    """
    download paper and supplement files and save them to save_dir/main_paper and save_dir/supplement
        respectively
    :param postfix: str, postfix that will be added at the end of papers' title
    :param save_dir: str, paper and supplement material's save path
    :param csv_file_path: str, the full path to csv file
    :param is_download_main_paper: bool, True for downloading main paper
    :param is_download_supplement: bool, True for downloading supplemental material
    :param time_step_in_seconds: int, the interval time between two downloading request in seconds
    :param total_paper_number: int, the total number of papers that is going to download
    :param downloader: str, the downloader to download, could be 'IDM' or 'Thunder', default to 'IDM'.
    :return: True
    """
    downloader = Downloader(downloader=downloader)
    if not os.path.exists(csv_file_path):
        raise ValueError(f'ERROR: file not found in {csv_file_path}!!!')

    main_save_path = os.path.join(save_dir, 'main_paper')
    if is_download_main_paper:
        os.makedirs(main_save_path, exist_ok=True)
    if is_download_supplement:
        supplement_save_path = os.path.join(save_dir, 'supplement')
        os.makedirs(supplement_save_path, exist_ok=True)

    error_log = []
    with open(csv_file_path, newline='') as csvfile:
        myreader = csv.DictReader(csvfile, delimiter=',')
        pbar = tqdm(myreader)
        i = 0
        for this_paper in pbar:
            is_grouped = ('group' in this_paper)
            i += 1
            # get title
            if is_grouped:
                group = slugify(this_paper['group'])
            title = slugify(this_paper['title'])
            if total_paper_number is not None:
                pbar.set_description(
                    f'Downloading paper {i}/{total_paper_number}')
            else:
                pbar.set_description(f'Downloading paper {i}')
            this_paper_main_path = os.path.join(main_save_path,
                                                f'{title}_{postfix}.pdf')
            if is_grouped:
                this_paper_main_path = os.path.join(main_save_path, group,
                                                    f'{title}_{postfix}.pdf')
            if is_download_supplement:
                this_paper_supp_path_no_ext = os.path.join(
                    supplement_save_path, f'{title}_{postfix}_supp.')
                if is_grouped:
                    this_paper_supp_path_no_ext = os.path.join(
                        supplement_save_path, group,
                        f'{title}_{postfix}_supp.')
                if '' != this_paper['supplemental link'] and os.path.exists(this_paper_main_path) and \
                        (os.path.exists(this_paper_supp_path_no_ext + 'zip') or os.path.exists(
                            this_paper_supp_path_no_ext + 'pdf')):
                    continue
                elif '' == this_paper['supplemental link'] and os.path.exists(
                        this_paper_main_path):
                    continue
            elif os.path.exists(this_paper_main_path):
                continue
            if 'error' == this_paper['main link']:
                error_log.append((title, 'no MAIN link'))
            elif '' != this_paper['main link']:
                if is_grouped:
                    if is_download_main_paper:
                        os.makedirs(os.path.join(main_save_path, group),
                                    exist_ok=True)
                    if is_download_supplement:
                        os.makedirs(os.path.join(supplement_save_path, group),
                                    exist_ok=True)
                if is_download_main_paper:
                    try:
                        # download paper with IDM
                        if not os.path.exists(this_paper_main_path):
                            downloader.download(
                                urls=this_paper['main link'].replace(
                                    ' ', '%20'),
                                save_path=os.path.join(os.getcwd(),
                                                       this_paper_main_path),
                                time_sleep_in_seconds=time_step_in_seconds)
                    except Exception as e:
                        # error_flag = True
                        print('Error: ' + title + ' - ' + str(e))
                        error_log.append((title, this_paper['main link'],
                                          'main paper download error', str(e)))
                # download supp
                if is_download_supplement:
                    # check whether the supp can be downloaded
                    if not (os.path.exists(this_paper_supp_path_no_ext + 'zip')
                            or os.path.exists(this_paper_supp_path_no_ext +
                                              'pdf')):
                        if 'error' == this_paper['supplemental link']:
                            error_log.append((title, 'no SUPPLEMENTAL link'))
                        elif '' != this_paper['supplemental link']:
                            supp_type = this_paper['supplemental link'].split(
                                '.')[-1]
                            try:
                                downloader.download(
                                    urls=this_paper['supplemental link'],
                                    save_path=os.path.join(
                                        os.getcwd(),
                                        this_paper_supp_path_no_ext +
                                        supp_type),
                                    time_sleep_in_seconds=time_step_in_seconds)
                            except Exception as e:
                                # error_flag = True
                                print('Error: ' + title + ' - ' + str(e))
                                error_log.append(
                                    (title, this_paper['supplemental link'],
                                     'supplement download error', str(e)))

        # write error log
        print('write error log')
        with open('..\\log\\download_err_log.txt', 'w') as f:
            for log in tqdm(error_log):
                for e in log:
                    if e is not None:
                        f.write(e)
                    else:
                        f.write('None')
                    f.write('\n')

                f.write('\n')

    return True
Example #11
def download_paper(year,
                   save_dir,
                   is_download_supplement=True,
                   time_step_in_seconds=5,
                   downloader='IDM'):
    """
    download all ICML papers and supplement files for the given year, saved to save_dir/main_paper and
    save_dir/supplement respectively
    :param year: int, ICML year, such as 2019
    :param save_dir: str, paper and supplement material's save path
    :param is_download_supplement: bool, True for downloading supplemental material
    :param time_step_in_seconds: int, the interval time between two download requests in seconds
    :param downloader: str, the downloader to use, could be 'IDM' or 'Thunder', defaults to 'IDM'
    :return: True
    A short usage sketch with assumed arguments follows the function definition.
    """
    downloader = Downloader(downloader=downloader)
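    # ICML year -> PMLR volume number (e.g. ICML 2019 is hosted at http://proceedings.mlr.press/v97/).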
    ICML_year_dict = {
        2021: 139,
        2020: 119,
        2019: 97,
        2018: 80,
        2017: 70,
        2016: 48,
        2015: 37,
        2014: 32,
        2013: 28
    }
    if year >= 2013:
        if year not in ICML_year_dict:
            raise ValueError('''the given year's url is unknown !''')
        init_url = f'http://proceedings.mlr.press/v{ICML_year_dict[year]}/'
    elif year == 2012:
        init_url = 'https://icml.cc/2012/papers.1.html'
    elif year == 2011:
        init_url = 'http://www.icml-2011.org/papers.php'
    elif 2009 == year:
        init_url = 'https://icml.cc/Conferences/2009/abstracts.html'
    elif 2008 == year:
        init_url = 'http://www.machinelearning.org/archive/icml2008/abstracts.shtml'
    elif 2007 == year:
        init_url = 'https://icml.cc/Conferences/2007/paperlist.html'
    elif year in [2006, 2004, 2005]:
        init_url = f'https://icml.cc/Conferences/{year}/proceedings.html'
    elif 2003 == year:
        init_url = 'https://aaai.org/Library/ICML/icml03contents.php'
    else:
        raise ValueError('''the given year's url is unknown !''')

    postfix = f'ICML_{year}'
    if os.path.exists(f'..\\urls\\init_url_icml_{year}.dat'):
        with open(f'..\\urls\\init_url_icml_{year}.dat', 'rb') as f:
            content = pickle.load(f)
    else:
        headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'
        }
        req = urllib.request.Request(url=init_url, headers=headers)
        content = urllib.request.urlopen(req).read()
        # content = open(f'..\\ICML_{year}.html', 'rb').read()
        with open(f'..\\urls\\init_url_icml_{year}.dat', 'wb') as f:
            pickle.dump(content, f)
    # soup = BeautifulSoup(content, 'html.parser')
    soup = BeautifulSoup(content, 'html5lib')
    # soup = BeautifulSoup(open(r'..\ICML_2011.html', 'rb'), 'html.parser')
    error_log = []
    if year >= 2013:
        if year in ICML_year_dict.keys():
            volume = f'v{ICML_year_dict[year]}'
        else:
            raise ValueError('''the given year's url is unknown !''')

        pmlr.download_paper_given_volume(
            volume=volume,
            save_dir=save_dir,
            postfix=postfix,
            is_download_supplement=is_download_supplement,
            time_step_in_seconds=time_step_in_seconds,
            downloader=downloader.downloader)
    elif 2012 == year:  # 2012
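        # ICML 2012: each paper sits in a <div class="paper">; the 'ICML version (pdf)' anchor holds the PDF link.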
        # base_url = f'https://icml.cc/{year}/'
        paper_list_bar = tqdm(soup.find_all('div', {'class': 'paper'}))
        paper_index = 0
        for paper in paper_list_bar:
            paper_index += 1
            title = slugify(paper.find('h2').text)
            link = None
            for a in paper.find_all('a'):
                if 'ICML version (pdf)' == a.text:
                    link = urllib.parse.urljoin(init_url, a.get('href'))
                    break
            if link is not None:
                this_paper_main_path = os.path.join(
                    save_dir, f'{title}_{postfix}.pdf'.replace(' ', '_'))
                paper_list_bar.set_description(
                    f'find paper {paper_index}:{title}')
                if not os.path.exists(this_paper_main_path):
                    paper_list_bar.set_description(
                        f'downloading paper {paper_index}:{title}')
                    downloader.download(
                        urls=link,
                        save_path=this_paper_main_path,
                        time_sleep_in_seconds=time_step_in_seconds)
            else:
                error_log.append((title, 'no main link error'))
    elif 2011 == year:
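        # ICML 2011: paper titles sit in <h3> tags inside anchors; anchors whose text slugifies to 'download' carry the PDF links.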
        paper_list_bar = tqdm(soup.find_all('a'))
        paper_index = 0
        title = None
        for paper in paper_list_bar:
            h3 = paper.find('h3')
            if h3 is not None:
                title = slugify(h3.text)
                paper_index += 1
            if 'download' == slugify(paper.text.strip()):
                link = paper.get('href')
                link = urllib.parse.urljoin(init_url, link)
                if link is not None and title is not None:
                    this_paper_main_path = os.path.join(
                        save_dir, f'{title}_{postfix}.pdf'.replace(' ', '_'))
                    paper_list_bar.set_description(
                        f'find paper {paper_index}:{title}')
                    if not os.path.exists(this_paper_main_path):
                        paper_list_bar.set_description(
                            f'downloading paper {paper_index}:{title}')
                        downloader.download(
                            urls=link,
                            save_path=this_paper_main_path,
                            time_sleep_in_seconds=time_step_in_seconds)
                else:
                    error_log.append((title, 'no main link error'))
    elif year in [2009, 2008]:
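        # ICML 2009/2008: titles are <h3> headings, each followed by an anchor whose text slugifies to 'full-paper'.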
        if 2009 == year:
            paper_list_bar = tqdm(
                soup.find('div', {
                    'id': 'right_column'
                }).find_all(['h3', 'a']))
        elif 2008 == year:
            paper_list_bar = tqdm(
                soup.find('div', {
                    'class': 'content'
                }).find_all(['h3', 'a']))
        paper_index = 0
        title = None
        for paper in paper_list_bar:
            if 'h3' == paper.name:
                title = slugify(paper.text)
                paper_index += 1
            elif 'full-paper' == slugify(paper.text.strip()):  # a
                link = paper.get('href')
                if link is not None and title is not None:
                    link = urllib.parse.urljoin(init_url, link)
                    this_paper_main_path = os.path.join(
                        save_dir, f'{title}_{postfix}.pdf')
                    paper_list_bar.set_description(
                        f'find paper {paper_index}:{title}')
                    if not os.path.exists(this_paper_main_path):
                        paper_list_bar.set_description(
                            f'downloading paper {paper_index}:{title}')
                        downloader.download(
                            urls=link,
                            save_path=this_paper_main_path,
                            time_sleep_in_seconds=time_step_in_seconds)
                    title = None
                else:
                    error_log.append((title, 'no main link error'))
    elif year in [2006, 2005]:
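        # ICML 2006/2005: the proceedings page links each paper's PDF (or PS) file directly; the anchor text is used as the title.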
        paper_list_bar = tqdm(soup.find_all('a'))
        paper_index = 0
        for paper in paper_list_bar:
            title = slugify(paper.text.strip())
            link = paper.get('href')
            paper_index += 1
            if link is not None and title is not None and (
                    'pdf' == link[-3:] or 'ps' == link[-2:]):
                link = urllib.parse.urljoin(init_url, link)
                this_paper_main_path = os.path.join(
                    save_dir, f'{title}_{postfix}.pdf'.replace(' ', '_'))
                paper_list_bar.set_description(
                    f'find paper {paper_index}:{title}')
                if not os.path.exists(this_paper_main_path):
                    paper_list_bar.set_description(
                        f'downloading paper {paper_index}:{title}')
                    downloader.download(
                        urls=link,
                        save_path=this_paper_main_path,
                        time_sleep_in_seconds=time_step_in_seconds)
    elif 2004 == year:
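        # ICML 2004: the proceedings table marks title rows with class 'proc_2004_title'; '[Paper]' anchors in the following rows point to the PDFs.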
        paper_index = 0
        paper_list_bar = tqdm(
            soup.find('table', {
                'class': 'proceedings'
            }).find_all('tr'))
        title = None
        for paper in paper_list_bar:
            tr_class = None
            try:
                tr_class = paper.get('class')[0]
            except:
                pass
            if 'proc_2004_title' == tr_class:  # title
                title = slugify(paper.text.strip())
                paper_index += 1
            else:
                for a in paper.find_all('a'):
                    if '[Paper]' == a.text:
                        link = a.get('href')
                        if link is not None and title is not None:
                            link = urllib.parse.urljoin(init_url, link)
                            this_paper_main_path = os.path.join(
                                save_dir,
                                f'{title}_{postfix}.pdf'.replace(' ', '_'))
                            paper_list_bar.set_description(
                                f'find paper {paper_index}:{title}')
                            if not os.path.exists(this_paper_main_path):
                                paper_list_bar.set_description(
                                    f'downloading paper {paper_index}:{title}')
                                downloader.download(
                                    urls=link,
                                    save_path=this_paper_main_path,
                                    time_sleep_in_seconds=time_step_in_seconds)
                        break
    elif 2003 == year:
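        # ICML 2003 (AAAI library): each paper entry links to an abstract page, which is fetched and searched for a PDF link.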
        paper_index = 0
        paper_list_bar = tqdm(
            soup.find('div', {
                'id': 'content'
            }).find_all('p', {'class': 'left'}))
        for paper in paper_list_bar:
            abs_link = None
            title = None
            link = None
            for a in paper.find_all('a'):
                abs_link = urllib.parse.urljoin(init_url, a.get('href'))
                if abs_link is not None:
                    title = slugify(a.text.strip())
                    break
            if title is not None:
                paper_index += 1
                this_paper_main_path = os.path.join(
                    save_dir, f'{title}_{postfix}.pdf'.replace(' ', '_'))
                paper_list_bar.set_description(
                    f'find paper {paper_index}:{title}')
                if not os.path.exists(this_paper_main_path):
                    if abs_link is not None:
                        headers = {
                            'User-Agent':
                            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'
                        }
                        req = urllib.request.Request(url=abs_link,
                                                     headers=headers)
                        abs_content = None
                        for i in range(3):
                            try:
                                abs_content = urllib.request.urlopen(
                                    req, timeout=10).read()
                                break
                            except Exception as e:
                                if i == 2:
                                    print('Error: ' + title + ' - ' + str(e))
                                    error_log.append(
                                        (title, abs_link, 'download error',
                                         str(e)))
                        if abs_content is None:
                            continue
                        abs_soup = BeautifulSoup(abs_content, 'html5lib')
                        for a in abs_soup.find_all('a'):
                            try:
                                if 'pdf' == a.get('href')[-3:]:
                                    link = urllib.parse.urljoin(
                                        abs_link, a.get('href'))
                                    if link is not None:
                                        paper_list_bar.set_description(
                                            f'downloading paper {paper_index}:{title}'
                                        )
                                        downloader.download(
                                            urls=link,
                                            save_path=this_paper_main_path,
                                            time_sleep_in_seconds=
                                            time_step_in_seconds)
                                    break
                            except:
                                pass

    # write error log
    print('write error log')
    with open('..\\log\\download_err_log.txt', 'w') as f:
        for log in tqdm(error_log):
            for e in log:
                if e is not None:
                    f.write(e)
                else:
                    f.write('None')
                f.write('\n')

            f.write('\n')
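
# A minimal usage sketch for download_paper above; the save directory is an
# assumed example path and is not part of the original code.
if __name__ == '__main__':
    download_paper(
        year=2019,
        save_dir='..\\ICML_2019',
        is_download_supplement=True,
        time_step_in_seconds=5,
        downloader='IDM')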
Example #12
def download_paper_given_volume(volume,
                                save_dir,
                                postfix,
                                is_download_supplement=True,
                                time_step_in_seconds=5,
                                downloader='IDM'):
    """
    download main and supplement papers from PMLR.
    :param volume: str, such as 'v1', 'r1'
    :param save_dir: str, paper and supplement material's save path
    :param postfix: str, the postfix that will be appended to the end of papers' titles
    :param is_download_supplement: bool, True for downloading supplemental material
    :param time_step_in_seconds: int, the interval time between two download requests in seconds
    :param downloader: str, the downloader to use, could be 'IDM' or 'Thunder', defaults to 'IDM'
    :return: True
    A short usage sketch with assumed arguments follows the function definition.
    """
    downloader = Downloader(downloader=downloader)
    init_url = f'http://proceedings.mlr.press/{volume}/'

    if is_download_supplement:
        main_save_path = os.path.join(save_dir, 'main_paper')
        supplement_save_path = os.path.join(save_dir, 'supplement')
        os.makedirs(main_save_path, exist_ok=True)
        os.makedirs(supplement_save_path, exist_ok=True)
    else:
        main_save_path = save_dir
        os.makedirs(main_save_path, exist_ok=True)
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'
    }
    req = urllib.request.Request(url=init_url, headers=headers)
    content = urllib.request.urlopen(req).read()
    soup = BeautifulSoup(content, 'html.parser')
    paper_list = soup.find_all('div', {'class': 'paper'})
    error_log = []
    title_list = []
    num_download = len(paper_list)
    pbar = tqdm(zip(paper_list, range(num_download)))
    for paper in pbar:
        # get title
        this_paper = paper[0]
        title = slugify(this_paper.find_all('p', {'class': 'title'})[0].text)
        try:
            pbar.set_description(
                f'Downloading paper {paper[1] + 1}/{num_download}: {title}')
        except:
            pbar.set_description(
                f'''Downloading paper {paper[1] + 1}/{num_download}: {title.encode('utf8')}'''
            )
        title_list.append(title)

        this_paper_main_path = os.path.join(main_save_path,
                                            f'{title}_{postfix}.pdf')
        if is_download_supplement:
            this_paper_supp_path = os.path.join(supplement_save_path,
                                                f'{title}_{postfix}_supp.pdf')
            this_paper_supp_path_no_ext = os.path.join(
                supplement_save_path, f'{title}_{postfix}_supp.')

            if os.path.exists(this_paper_main_path) and os.path.exists(
                    this_paper_supp_path):
                continue
        else:
            if os.path.exists(this_paper_main_path):
                continue

        # collect the main PDF link and (optionally) the supplementary link for this paper
        links = this_paper.find_all('p', {'class': 'links'})[0].find_all('a')
        supp_link = None
        main_link = None
        for link in links:
            if 'Download PDF' == link.text or 'pdf' == link.text:
                main_link = link.get('href')
            elif is_download_supplement and (
                    'Supplementary PDF' == link.text or 'Supplementary Material' == link.text or \
                    'supplementary' == link.text):
                supp_link = link.get('href')
                if supp_link[-3:] != 'pdf':
                    this_paper_supp_path = this_paper_supp_path_no_ext + supp_link[
                        -3:]

        # try 1 time
        # error_flag = False
        for d_iter in range(1):
            try:
                # download paper with IDM
                if not os.path.exists(
                        this_paper_main_path) and main_link is not None:
                    downloader.download(
                        urls=main_link,
                        save_path=this_paper_main_path,
                        time_sleep_in_seconds=time_step_in_seconds)
            except Exception as e:
                # error_flag = True
                print('Error: ' + title + ' - ' + str(e))
                error_log.append(
                    (title, main_link, 'main paper download error', str(e)))
            # download supp
            if is_download_supplement:
                # check whether the supp can be downloaded
                if not os.path.exists(
                        this_paper_supp_path) and supp_link is not None:
                    try:
                        downloader.download(
                            urls=supp_link,
                            save_path=this_paper_supp_path,
                            time_sleep_in_seconds=time_step_in_seconds)
                    except Exception as e:
                        # error_flag = True
                        print('Error: ' + title + ' - ' + str(e))
                        error_log.append(
                            (title, supp_link, 'supplement download error',
                             str(e)))

    # write error log
    print('writing error log...')
    with open('..\\log\\download_err_log.txt', 'w') as f:
        for log in tqdm(error_log):
            for e in log:
                if e is not None:
                    f.write(e)
                else:
                    f.write('None')
                f.write('\n')
            f.write('\n')

    return True
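
# A minimal usage sketch for download_paper_given_volume above; ICML 2019
# corresponds to PMLR volume v97, and the save directory is an assumed
# example path rather than part of the original code.
if __name__ == '__main__':
    download_paper_given_volume(
        volume='v97',
        save_dir='..\\ICML_2019',
        postfix='ICML_2019',
        is_download_supplement=True,
        time_step_in_seconds=5,
        downloader='IDM')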