Example #1
    def get_latest_build_date(self):
        """ Returns date of latest available nightly build."""
        if self.application not in ('fennec',):
            url = urljoin(self.base_url, 'nightly', 'latest-%s/' % self.branch)
        else:
            url = urljoin(self.base_url, 'nightly', 'latest-%s-%s/' %
                          (self.branch, self.platform))

        self.logger.info('Retrieving the build status file from %s' % url)
        parser = DirectoryParser(url, authentication=self.authentication,
                                 timeout=self.timeout_network)
        parser.entries = parser.filter(r'.*%s\.txt' % self.platform_regex)
        if not parser.entries:
            message = 'Status file for %s build cannot be found' % \
                self.platform_regex
            raise errors.NotFoundError(message, url)

        # Read status file for the platform, retrieve build id,
        # and convert to a date
        headers = {'Cache-Control': 'max-age=0'}

        r = requests.get(url + parser.entries[-1],
                         auth=self.authentication, headers=headers)
        try:
            r.raise_for_status()

            return datetime.strptime(r.text.split('\n')[0], '%Y%m%d%H%M%S')
        finally:
            r.close()
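
Note: several of the examples in this collection call urljoin with three or more positional arguments (e.g. urljoin(self.base_url, 'nightly', ...)), which the standard-library urllib.parse.urljoin does not accept, so those projects ship their own variadic helper. The sketch below is only an illustration of what such a helper might look like, assuming it simply normalizes slashes and joins the fragments; it is not any of these projects' actual implementation.

def urljoin(*fragments):
    """Hypothetical variadic join: strip trailing slashes from all but the
    last fragment, leading slashes from all but the first, then join the
    pieces with single '/' separators."""
    parts = [f.rstrip('/') if i < len(fragments) - 1 else f
             for i, f in enumerate(fragments)]
    parts = [p.lstrip('/') if i > 0 else p for i, p in enumerate(parts)]
    return '/'.join(parts)

# e.g. urljoin('https://archive.example.org/pub/', 'nightly', 'latest-central/')
# -> 'https://archive.example.org/pub/nightly/latest-central/'
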
def get_info(url, cw=None):
    print_ = get_print(cw)
    info = {}

    html = downloader.read_html(url)
    if '"cafe_cautionpage"' in html:
        raise errors.LoginRequired()
    url_article = re.find(r'''//cafe\.naver\.com/ArticleRead\.nhn\?articleid=[0-9]+&clubid=[0-9]+''', html, err='no iframe')
    url_article = urljoin(url, url_article)

    print_(url_article)

    articleid = re.find(r'articleid=([0-9]+)', url_article)
    clubid = re.find(r'clubid=([0-9]+)', url_article)
    url_api = f'https://apis.naver.com/cafe-web/cafe-articleapi/v2/cafes/{clubid}/articles/{articleid}?query=&useCafeId=true&requestFrom=A'

    j = downloader.read_json(url_api, url)

    info['title'] = j['result']['article']['subject']
    info['cafename'] = j['result']['cafe']['url']
    info['cafeid'] = clubid
    info['id'] = articleid
    
    html_content = j['result']['article']['contentHtml']
    soup = Soup(html_content)

    imgs = []

    pairs = []

    for video in soup.findAll('span', class_='_naverVideo'):
        vid = video.attrs['vid']
        key = video.attrs['key']
        pairs.append((vid, key))

    for script in soup.findAll('script', class_='__se_module_data'):
        data_raw = script['data-module']
        data = json.loads(data_raw)['data']
        vid = data.get('vid')
        if not vid:
            continue
        key = data['inkey']
        pairs.append((vid, key))

    for vid, key in pairs:
        url_api = 'https://apis.naver.com/rmcnmv/rmcnmv/vod/play/v2.0/{}?key={}'.format(vid, key)
        data_raw = downloader.read_html(url_api)
        data = json.loads(data_raw)
        fs = data['videos']['list']
        fs = sorted(fs, key=lambda f: f['size'], reverse=True)
        video = Image(fs[0]['source'], url_article, len(imgs))
        imgs.append(video)
        
    for img in soup.findAll('img'):
        img = Image(urljoin(url_article, img['src']), url, len(imgs))
        imgs.append(img)

    info['imgs'] = imgs

    return info
Example #3
def fix_url(url, session=None, cw=None):
    print_ = get_print(cw)
    if '&manga_name=' not in url:
        return url
    print_('fix url')
    qs = query_url(url)
    name = qs['manga_name'][0].replace('+', ' ')
    url_search = urljoin(url, '/bbs/search.php')
    url_search = update_url_query(url_search, {'stx': [name]})
    print(url_search)
    html = read_html(url_search, session=session)
    soup = Soup(html)
    posts = soup.findAll('div', class_='post-row')
    print_('posts: {}'.format(len(posts)))
    if len(posts) != 1:
        return url
    for a in posts[0].findAll('a'):
        href = urljoin(url, a.attrs['href'])
        if 'manga_detail' in href:
            break
    else:
        raise Exception('Failed to find link')

    if cw is not None:
        cw.gal_num = href
    return href
Example #4
def get_imgs(url, soup=None, name=None):
    if soup is None:
        html = downloader.read_html(url)
        soup = Soup(html)

    view = soup.findAll('div', class_='rootContant')[:2][-1]

    v = view.find('video')
    if v:
        img = v.find('source').attrs['src']
        img = urljoin(url, img)
        img = Image(img, url, 0, 'video')
        ext = os.path.splitext(img.url().split('?')[0])[1]
        img.filename = u'{}{}'.format(name, ext)
        return [img]

    imgs = []
    for img in view.findAll('img'):
        img = img.attrs['dataurl']
        img = urljoin(url, img)
        img = re.sub('/[a-z]+images/', '/images/', img).replace('_t.', '.')
        img = Image(img, url, len(imgs))
        imgs.append(img)

    return imgs
def get_files(url, session, multi_post=False, cw=None):
    print_ = get_print(cw)
    html = read_html(url, session=session)
    soup = Soup(html)
    h = soup.find('h1', class_='title')
    content = h.parent.parent.parent
    title = h.text.strip()
    youtube = content.find('div', class_='embedded-video')
    video = content.find('video')
    if youtube:
        type = 'youtube'
    elif video:
        type = 'video'
    else:
        type = 'image'
    print_(('type: {}').format(type))
    files = []
    if type == 'image':
        urls = set()
        for img in content.findAll('img'):
            img = urljoin(url, img.parent.attrs['href'])
            if '/files/' not in img:
                continue
            if img in urls:
                print('duplicate')
                continue
            urls.add(img)
            file = File(type,
                        img,
                        title,
                        url,
                        len(files),
                        multi_post=multi_post)
            files.append(file)

    elif type == 'youtube':
        src = urljoin(url, youtube.find('iframe').attrs['src'])
        file = File(type, src, title, url)
        files.append(file)
    elif type == 'video':
        url_thumb = urljoin(url, video.attrs['poster'])
        print('url_thumb:', url_thumb)
        id = re.find('videos/([0-9a-zA-Z_-]+)', url, err='no video id')
        url_data = urljoin(url, '/api/video/{}'.format(id))
        s_json = read_html(url_data, url, session=session)
        data = json.loads(s_json)
        video = data[0]
        url_video = urljoin(url, video['uri'])
        file = File(type, url_video, title, url)
        file.url_thumb = url_thumb
        file.thumb = BytesIO()
        downloader.download(url_thumb, buffer=file.thumb, referer=url)
        files.append(file)
    else:
        raise NotImplementedError(type)
    return files
Example #6
def get_imgs(page, cw=None):
    print_ = get_print(cw)
    html = downloader.read_html(page.url)
    soup = Soup(html)

    type_ = re.find('''webtoonType *: *['"](.+?)['"]''', html)
    print_('type: {}'.format(type_))

    imgs = []
    if type_ == 'DEFAULT':  # https://m.comic.naver.com/webtoon/detail.nhn?titleId=715772
        view = soup.find('div', class_='toon_view_lst')
        for img in view.findAll('img'):
            img = img.attrs.get('data-src')
            if not img:
                continue
            img = urljoin(page.url, img)
            img = Image(img, page, len(imgs))
            imgs.append(img)
    elif type_ == 'CUTTOON':  # https://m.comic.naver.com/webtoon/detail.nhn?titleId=752803
        view = soup.find('div', class_='swiper-wrapper')
        for div in view.findAll('div', class_='swiper-slide'):
            if div.parent != view:
                continue
            if div.find('div', class_='cut_viewer_last'):
                print('cut_viewer_last')
                continue
            if div.find('div', class_='cut_viewer_recomm'):
                print('cut_viewer_recomm')
                continue
            img = div.find('img')
            img = img.attrs['data-src']
            img = urljoin(page.url, img)
            img = Image(img, page, len(imgs))
            imgs.append(img)
    elif type_ == 'EFFECTTOON':  #2313; https://m.comic.naver.com/webtoon/detail.nhn?titleId=670144
        img_base = re.find('''imageUrl *: *['"](.+?)['"]''', html) + '/'
        print('img_base:', img_base)
        url_api = re.find('''documentUrl *: *['"](.+?)['"]''', html)
        data_raw = downloader.read_html(url_api, page.url)
        data = json.loads(data_raw)
        for img in data['assets']['stillcut'].values():  # ordered in python3.7+
            img = urljoin(img_base, img)
            img = Image(img, page, len(imgs))
            imgs.append(img)
    else:
        _imgs = re.findall('sImageUrl *: *[\'"](.+?)[\'"]', html)
        if not _imgs:
            raise Exception('no imgs')
        for img in _imgs:
            img = urljoin(page.url, img)
            img = Image(img, page, len(imgs))
            imgs.append(img)

    return imgs
Example #7
    def path_regex(self):
        """Return the regex for the path"""

        try:
            path = urljoin(self.monthly_build_list_regex,
                           self.builds[self.build_index])
            return path
        except:
            folder = urljoin(self.base_url, self.monthly_build_list_regex)
            raise NotFoundError("Specified sub folder cannot be found",
                                folder)
def read_channel(url_page, cw=None):
    print_ = get_print(cw)
    res = re.find(CHANNEL_PATTERN, url_page)
    if res is None:
        raise Exception('Not channel')
    header, username = res
    print(header, username)
    max_pid = get_max_range(cw)
    info = {}
    info['header'] = header
    info['username'] = username
    session = Session()
    urls = []
    ids = set()
    for p in range(100):
        url_api = urljoin(url_page,
                          '/{}/{}/videos/best/{}'.format(header, username, p))
        print_(url_api)
        r = session.post(url_api)
        data = json.loads(r.text)

        videos = data.get('videos')  #4530
        if not videos:
            print_('empty')
            break

        for video in videos:
            id_ = video['id']
            if id_ in ids:
                print_('duplicate: {}'.format(id_))
                continue
            ids.add(id_)
            info['name'] = video['pn']
            urls.append(urljoin(url_page, video['u']))

        if len(urls) >= max_pid:
            break

        n = data['nb_videos']

        s = '{} {} - {}'.format(tr_('읽는 중...'), info['name'], len(urls))
        if cw:
            cw.setTitle(s)
        else:
            print(s)
        if len(ids) >= n:
            break
        sleep(1, cw)
    if not urls:
        raise Exception('no videos')
    info['urls'] = urls[:max_pid]
    return info
Example #9
    def path_regex(self):
        """Return the regex for the path"""

        try:
            path = urljoin(self.monthly_build_list_regex,
                           self.builds[self.build_index])
            if self.application in MULTI_LOCALE_APPLICATIONS \
                    and self.locale != 'multi':
                path = urljoin(path, self.locale)
            return path
        except:
            folder = urljoin(self.base_url, self.monthly_build_list_regex)
            raise NotFoundError("Specified sub folder cannot be found",
                                folder)
Example #10
    def path_regex(self):
        """Return the regex for the path"""

        try:
            path = urljoin(self.monthly_build_list_regex,
                           self.builds[self.build_index])
            if self.application in APPLICATIONS_MULTI_LOCALE \
                    and self.locale != 'multi':
                path = urljoin(path, self.locale)
            return path
        except:
            folder = urljoin(self.base_url, self.monthly_build_list_regex)
            raise errors.NotFoundError("Specified sub folder cannot be found",
                                       folder)
    def read(self):
        if '/video/' in self.url:
            res = clf2.solve(self.url, session=self.session, cw=self.cw)
            soup = Soup(res['html'])
            title = soup.find('h1', id='post_title').text.strip()
            self.title = title
            view = soup.find('div', id='post')
            video = view.find('video')
            src = video.find('source')['src']
            src = urljoin(self.url, src)
            video = Video(src, self.url, title, self.session)
            self.urls.append(video.url)
            self.single = True
            return
        
        if '/image/' not in self.url:
            raise NotImplementedError('Not a post')

        res = clf2.solve(self.url, session=self.session, cw=self.cw)
        soup = Soup(res['html'])
        title = soup.find('h2').text
        paginator = soup.find('div', id='paginator')
        pages = [self.url]
        for a in paginator.findAll('a'):
            href = a.get('href')
            if not href:
                continue
            href = urljoin(self.url, href)
            if href not in pages:
                pages.append(href)

        imgs = []
        for i, page in enumerate(pages):
            if page == self.url:
                soup_page = soup
            else:
                soup_page = downloader.read_soup(page, session=self.session)
            view = soup_page.find('div', id='post')
            for img in view.findAll('img'):
                href = img.parent['href']
                href = urljoin(page, href)
                img = Image(href, page, len(imgs), self.session)
                imgs.append(img)
            self.cw.setTitle('{} {} ({} / {})'.format(tr_('읽는 중...'), title, i+1, len(pages)))

        for img in imgs:
            self.urls.append(img.url)

        self.title = clean_title(title)
def read_channel(url_page, cw=None):
    print_ = get_print(cw)
    res = re.find(CHANNEL_PATTERN, url_page)
    if res is None:
        raise Exception('Not channel')
    header, username = res
    print(header, username)
    max_pid = get_max_range(cw, 2000)
    info = {}
    info['header'] = header
    info['username'] = username
    session = Session()
    urls = []
    urls_set = set()
    for p in range(100):
        url_api = urljoin(url_page, '/{}/{}/videos/best/{}'.format(header, username, p))
        print(url_api)
        r = session.post(url_api, data='main_cats=false')
        soup = Soup(r.text)
        thumbs = soup.findAll('div', class_='thumb-block')
        if not thumbs:
            print_('empty')
            break
        for thumb in thumbs:
            info['name'] = thumb.find('span', class_='name').text.strip()
            href = thumb.find('a')['href']
            href = urljoin(url_page, href)
            if href in urls_set:
                print_('duplicate: {}'.format(href))
                continue
            urls_set.add(href)
            urls.append(href)
        
        if len(urls) >= max_pid:
            break
        
        s = '{} {} - {}'.format(tr_('읽는 중...'), info['name'], len(urls))
        if cw:
            if not cw.alive:
                return
            cw.setTitle(s)
        else:
            print(s)
    if not urls:
        raise Exception('no videos')
    info['urls'] = urls[:max_pid]
    return info
        
Example #13
def get_pages(soup, url):
    pages = []
    hrefs = set()
    titles = set()
    for a in soup.findAll(lambda tag: tag.name == 'a' and '/viewer/stories/' in
                          tag.get('href', ''))[::-1]:
        href = urljoin(url, a.attrs['href'])
        if href in hrefs:
            continue
        hrefs.add(href)
        divs = a.findAll('div', recursive=False)
        if len(divs) < 2:
            continue
        right = divs[1]
        number = right.findAll('span')[0].text.strip()
        title = right.findAll('span')[1].text.strip()
        title = ' - '.join(x for x in [number, title] if x)
        if title in titles:
            title0 = title
            i = 2
            while title in titles:
                title = title0 + ' ({})'.format(i)
                i += 1
        titles.add(title)
        page = Page(href, title)
        pages.append(page)
    if not pages:
        raise Exception('no pages')

    return pages
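
The title-disambiguation loop above appends an incrementing " (N)" suffix until the title is unique. A stripped-down sketch of just that logic, with made-up titles (not part of the original code):

def dedupe_title(title, titles):
    """Append ' (2)', ' (3)', ... until the title is not already taken."""
    if title in titles:
        base, i = title, 2
        while title in titles:
            title = '{} ({})'.format(base, i)
            i += 1
    titles.add(title)
    return title

seen = set()
print(dedupe_title('Chapter 1', seen))  # Chapter 1
print(dedupe_title('Chapter 1', seen))  # Chapter 1 (2)
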
Example #14
def get_video(url):
    html = downloader.read_html(url)
    soup = Soup(html)

    view = soup.find('div', id='player-container-fluid')
    src_best = None
    res_best = -1
    for source in view.findAll('source'):
        src = urljoin(url, source.attrs['src'])
        res = re.find('([0-9]+)p', source.attrs['title'])
        res = int(res) if res else 0
        if res > res_best:
            src_best = src
            res_best = res

    if src_best is None:
        raise Exception('No source')

    title = soup.find('h1').text.strip()
    id = soup.find('div', id='video').attrs['data-id']

    url_thumb = soup.find('meta', {'property': 'og:image'}).attrs['content']

    #src_best = downloader.real_url(src_best)

    video = Video(src_best, url_thumb, url, title, id)
    return video
def generic_iter_pages(start_url, page_parser_cls,
                       request_fn: RequestFunction) -> Iterator[Page]:
    next_url = start_url

    while next_url:
        logger.debug("Requesting page from: %s", next_url)
        response = request_fn(next_url)

        logger.debug("Parsing page response")
        parser = page_parser_cls(response)

        page = parser.get_page()

        # TODO: If page is actually an iterable calling len(page) might consume it
        logger.debug("Got %s raw posts from page", len(page))
        yield page

        logger.debug("Looking for next page URL")
        next_page = parser.get_next_page()
        if next_page:
            next_url = utils.urljoin(FB_MOBILE_BASE_URL, next_page)
            next_url = next_url.replace("&num_to_fetch=4", "&num_to_fetch=20")
        else:
            logger.info("Page parser did not find next page URL")
            next_url = None
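
generic_iter_pages only needs a callable that fetches a URL and returns a response the page-parser class understands. The driver below is a hypothetical usage sketch; the requests-based request_fn and the MobilePageParser name are illustrative assumptions, not the library's actual API.

import requests

def request_fn(url):
    # Any callable that returns a response the parser class can consume works here.
    return requests.get(url, timeout=30)

# for page in generic_iter_pages(start_url, MobilePageParser, request_fn):
#     for raw_post in page:
#         process(raw_post)
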
 def get(self, referer):
     soup = downloader.read_soup(self._url, referer, session=self.session)
     div = soup.find('div', id='display_image_detail')
     url = urljoin(self._url, div.find('img').parent['href'])
     ext = get_ext(url)
     self.filename = '{:04}{}'.format(self._p, ext)
     return url, self._url
Example #17
    def get_build_info_for_index(self, build_index=None):
        url = urljoin(self.base_url, self.build_list_regex)

        self.logger.info('Retrieving list of builds from %s' % url)
        parser = DirectoryParser(url, authentication=self.authentication,
                                 timeout=self.timeout_network)
        parser.entries = parser.filter(r'^\d+$')

        if self.timestamp:
            # If a timestamp is given, retrieve the folder with the timestamp
            # as name
            parser.entries = self.timestamp in parser.entries and \
                [self.timestamp]

        elif self.date:
            # If date is given, retrieve the subset of builds on that date
            parser.entries = filter(self.date_matches, parser.entries)

        if not parser.entries:
            message = 'No builds have been found'
            raise NotFoundError(message, url)

        self.show_matching_builds(parser.entries)

        # If no index has been given, set it to the last build of the day.
        if build_index is None:
            build_index = len(parser.entries) - 1

        return (parser.entries, build_index)
    def init(self):
        self.url = clean_url(self.url)
        self.session = Session()
        if re.search(PATTERN_ID, self.url):  #1799
            select = self.soup.find('select', class_='bookselect')
            for i, op in enumerate(select.findAll('option')[::-1]):
                if 'selected' in op.attrs:
                    break
            else:
                raise Exception('no selected option')
            for a in self.soup.findAll('a'):
                url = urljoin(self.url, a.get('href') or '')
                if re.search(PATTERN, url):
                    break
            else:
                raise Exception('list not found')
            self.url = self.fix_url(url)
            self._soup = None

            for i, page in enumerate(
                    get_pages(self.url, self.session, self.soup)):
                if page.id == int(op['value']):
                    break
            else:
                raise Exception('can not find page')
            self.cw.range_p = [i]
Example #19
    def path_regex(self):
        """Return the regex for the path to the build folder"""

        if self.locale_build:
            return self.build_list_regex

        return '%s/' % urljoin(self.build_list_regex, self.builds[self.build_index])
def get_info(url, soup=None):
    if soup is None:
        html = downloader.read_html(url)
        soup = Soup(html)

    info = {}

    info['title'] = soup.find('h1', id='workTitle').text.strip()
    info['artist'] = soup.find('span',
                               id='workAuthor-activityName').text.strip()

    desc = soup.find('section', id='description')
    button = desc.find('span', class_='ui-truncateTextButton-expandButton')
    if button:
        print('decompose button')
        button.decompose()
    catch = desc.find('span', id='catchphrase-body').text.strip()
    intro = desc.find('p', id='introduction').text.strip()
    desc = u'  {}\n\n\n{}'.format(catch, intro)
    info['description'] = desc

    pages = []
    for a in soup.findAll('a', class_='widget-toc-episode-episodeTitle'):
        href = urljoin(url, a.attrs['href'])
        subtitle = a.find('span',
                          class_='widget-toc-episode-titleLabel').text.strip()
        date = a.find('time',
                      class_='widget-toc-episode-datePublished').text.strip()
        page = Page(href, subtitle, date, len(pages) + 1)
        pages.append(page)

    info['pages'] = pages

    return info
Example #21
def get_imgs_page(page, session, cw=None):
    print_ = get_print(cw)
    print_(page.title)
    html = downloader.read_html(page.url, session=session)
    soup = Soup(html)

    view = soup.find('div', class_='chapter-content')

    if not view:
        raise Exception('no chapter-content')

    imgs = []
    for img in soup.findAll('img', class_='chapter-img'):
        src = img.get('data-pagespeed-lazy-src') or img.get(
            'data-src') or img.get('data-srcset') or img.get(
                'data-aload') or img['src']
        try:
            src = base64.b64decode(src).strip().decode('utf8')
        except:
            pass
        src = urljoin(page.url, src)
        if 'Credit_LHScan_' in src or '5e1ad960d67b2_5e1ad962338c7' in src:
            continue
        if 'fe132b3d32acc39f5adcea9075bedad4LoveHeaven' in src:
            continue
        if 'LoveHug_600cfd96e98ff.jpg' in src:
            continue
        img = Image(src.strip(), page, len(imgs))
        imgs.append(img)

    return imgs
def get_pages(url, soup=None, session=None):
    if soup is None:
        html = read_html(url, session=session, cw=None)
        soup = Soup(html)
    pagination = soup.find('div', class_='pagination')

    pages = []
    hrefs = set()
    for a in pagination.findAll('a'):
        href = a.attrs.get('href', '')
        href = urljoin(url, href)
        if not href.startswith(url):
            print('not match', href)
            continue
        while href.endswith('/'):
            href = href[:-1]
        if href in hrefs:
            print('duplicate', href)
            continue
        hrefs.add(href)
        text = a.text.strip()
        page = Page(text, href)
        pages.append(page)

    if url not in hrefs:
        page = Page('1', url, soup)
        pages.insert(0, page)

    return pages
 def _call(self, url_api, referer='https://twitter.com', params=None):
     url_api = urljoin('https://api.twitter.com', url_api)
     if params:
         url_api = update_url_query(url_api, params)
     #print('call:', url_api)
     data = downloader.read_json(url_api, referer, session=self.session)
     return data
Example #24
    def __init__(self, destination=None, platform=None,
                 application='firefox', locale=None, extension=None,
                 username=None, password=None,
                 retry_attempts=0, retry_delay=10.,
                 is_stub_installer=False, timeout=None,
                 log_level='INFO',
                 base_url=BASE_URL):

        # Private properties for caching
        self._filename = None
        self._binary = None

        self.destination = destination or os.getcwd()

        if not locale:
            if application in APPLICATIONS_MULTI_LOCALE:
                self.locale = 'multi'
            else:
                self.locale = 'en-US'
        else:
            self.locale = locale

        self.platform = platform or self.detect_platform()

        self.session = requests.Session()
        if (username, password) != (None, None):
            self.session.auth = (username, password)

        self.retry_attempts = retry_attempts
        self.retry_delay = retry_delay
        self.is_stub_installer = is_stub_installer
        self.timeout_download = timeout
        # this is the timeout used in requests.get. Unlike "auth",
        # it does not work if we attach it on the session, so we handle
        # it independently.
        self.timeout_network = 60.

        logging.basicConfig(format=' %(levelname)s | %(message)s')
        self.logger = logging.getLogger(self.__module__)
        self.logger.setLevel(log_level)
        logging.getLogger('redo').setLevel(logging.INFO)

        # build the base URL
        self.application = application
        self.base_url = '%s/' % urljoin(
            base_url,
            APPLICATIONS_TO_FTP_DIRECTORY.get(self.application, self.application)
        )

        if extension:
            self.extension = extension
        else:
            if self.application in APPLICATIONS_MULTI_LOCALE and \
                    self.platform in ('win32', 'win64'):
                # builds for APPLICATIONS_MULTI_LOCALE only exist in zip
                self.extension = 'zip'
            else:
                self.extension = DEFAULT_FILE_EXTENSIONS[self.platform]

        self._retry_check_404(self.get_build_info)
def get_info(url):
    url = downloader.real_url(url)
    q = re.find(r'/comic/([^/?]+)', url)

    url_api = 'https://nhentai.com/api/comics/{}'.format(q)
    data_raw = downloader.read_html(url_api, url)
    data = json.loads(data_raw)

    url_api = 'https://nhentai.com/api/comics/{}/images'.format(q)
    data_raw = downloader.read_html(url_api, url)
    data_images = json.loads(data_raw)

    info = {}
    info['url'] = url

    info['id'] = int(data['id'])
    info['type'] = data['category']['name']
    info['title'] = data['title']
    info['artists'] = [x['name'] for x in data['artists']]
    info['groups'] = [x['name'] for x in data['groups']]
    info['seriess'] = [x['name'] for x in data['parodies']]
    info['lang'] = data['language']['name']

    imgs = []
    for img in data_images['images']:
        img = urljoin(url, img['source_url'])
        img = Image(url, img, len(imgs))
        imgs.append(img)
    info['imgs'] = imgs

    return info
Example #26
def read_channel(url, type_, cw=None):
    print_ = get_print(cw)
    username = re.find(r'/users/([^/]+)', url, err='no username')
    info = {}
    urls = []
    urls_set = set()
    for p in range(50):
        url = 'https://ecchi.iwara.tv/users/{}/{}?page={}'.format(
            username, type_, p)
        print_(url)
        html = downloader.read_html(url)
        soup = Soup(html)
        if p == 0:
            title = soup.find('h1', class_='page-title').text
            info['title'] = title.replace("'s videos", '').strip()

        view = soup.find('div', class_='view-content')
        if view is None:
            break

        urls_new = []
        for div in view.findAll('div', class_='views-column'):
            href = div.find('a')['href']
            url_video = urljoin(url, href)
            if url_video in urls_set:
                continue
            urls_set.add(url_video)
            urls_new.append(url_video)
        if not urls_new:
            break
        urls += urls_new
    info['urls'] = urls
    return info
Example #27
    def get_build_info_for_date(self, date, has_time=False, build_index=None):
        url = urljoin(self.base_url, self.monthly_build_list_regex)

        self.logger.info('Retrieving list of builds from %s' % url)
        parser = DirectoryParser(url, authentication=self.authentication,
                                 timeout=self.timeout_network)
        regex = r'%(DATE)s-(\d+-)+%(BRANCH)s%(L10N)s$' % {
            'DATE': date.strftime('%Y-%m-%d'),
            'BRANCH': self.branch,
            'L10N': '' if self.locale == 'en-US' else '(-l10n)?'}
        parser.entries = parser.filter(regex)
        parser.entries = parser.filter(self.is_build_dir)

        if has_time:
            # If a time is included in the date, use it to determine the
            # build's index
            regex = r'.*%s.*' % date.strftime('%H-%M-%S')
            parser.entries = parser.filter(regex)

        if not parser.entries:
            date_format = '%Y-%m-%d-%H-%M-%S' if has_time else '%Y-%m-%d'
            message = 'Folder for builds on %s has not been found' % \
                self.date.strftime(date_format)
            raise NotFoundError(message, url)

        # If no index has been given, set it to the last build of the day.
        self.show_matching_builds(parser.entries)
        if build_index is None:
            build_index = len(parser.entries) - 1

        return (parser.entries, build_index)
Example #28
def get_pages(url):
    pages = []
    urls = set()
    for p in range(1, 101):
        url_page = set_page(url, p)
        print(url_page)
        for try_ in range(4):
            try:
                soup = downloader.read_soup(url_page)
                view = soup.find('ul', id='_listUl')
                if view is None:
                    raise Exception('no view')
                break
            except Exception as e:
                e_ = e
                print(e)
        else:
            raise e_
        pages_new = []
        for li in view.findAll('li', recursive=False):
            href = urljoin(url, li.find('a')['href'])
            title = li.find('span', class_='subj').text.strip()
            if href in urls:
                continue
            urls.add(href)
            no = int(li['data-episode-no'])
            title = '{:04} - {}'.format(no, title)
            page = Page(href, title)
            pages_new.append(page)
        if not pages_new:
            break
        pages += pages_new
    return pages[::-1]
Example #29
    def get_build_info_for_index(self, build_index=None):
        url = urljoin(self.base_url, self.build_list_regex)

        self.logger.info('Retrieving list of builds from %s' % url)
        parser = self._create_directory_parser(url)
        parser.entries = parser.filter(r'^\d+$')

        if self.timestamp:
            # If a timestamp is given, retrieve the folder with the timestamp
            # as name
            parser.entries = self.timestamp in parser.entries and \
                [self.timestamp]

        elif self.date:
            # If date is given, retrieve the subset of builds on that date
            parser.entries = filter(self.date_matches, parser.entries)

        if not parser.entries:
            message = 'No builds have been found'
            raise errors.NotFoundError(message, url)

        self.show_matching_builds(parser.entries)

        # If no index has been given, set it to the last build of the day.
        if build_index is None:
            # Find the most recent non-empty entry.
            build_index = len(parser.entries)
            for build in reversed(parser.entries):
                build_index -= 1
                if not build_index or self.is_build_dir(build):
                    break

        self.logger.info('Selected build: %s' % parser.entries[build_index])

        return (parser.entries, build_index)
Example #30
def get_info(url, soup=None):
    if soup is None:
        html = downloader.read_html(url)
        soup = Soup(html)

    info = {}

    info['id'] = get_id(url)

    title = soup.find('h1').text.strip()
    info['title'] = title

    for tag in soup.findAll('span', class_='tag'):
        href = tag.parent.attrs['href']
        href = urljoin(url, href).strip('/')

        key = href.split('/')[3]
        value = href.split('/')[-1]

        if key == 'language' and value == 'translated':
            continue

        if key in info:
            info[key].append(value)
        else:
            info[key] = [value]

    for key in ['artists', 'groups', 'parodies', 'tags', 'characters']:
        if key not in info:
            info[key] = []

    return info
        def f(_):
            html = downloader.read_html(url, session=session)
            soup = Soup(html)

            box = soup.find('section', id='picBox')
            img = box.find('img')
            if img is None:
                raise Exception('No img')

            onclick = img.attrs.get('onclick', '')
            if onclick and '.src' in onclick:
                print('onclick', onclick)
                img = re.find('''.src *= *['"](.+?)['"]''', onclick)
            else:
                img = img.attrs['src']
            img = urljoin(url, img)

            filename = clean_title(os.path.basename(img.split('?')[0]))
            name, ext = os.path.splitext(filename)

            # https://www.hentai-foundry.com/pictures/user/DrGraevling/74069/Eversong-Interrogation-pg.-13
            if ext.lower() not in [
                    '.bmp', '.png', '.gif', '.jpg', '.jpeg', '.webp', '.webm',
                    '.avi', '.mp4', '.mkv', '.wmv'
            ]:
                filename = u'{}.jpg'.format(name)

            self.filename = filename
            return img
Example #32
    def downloadRawFile ( self , remote , local=None ) :
        """Downloads a remote file to the local system.

        remote - path relative to repository base
        local - Optional local name for the file

        Returns the local file name or False if errors"""

        remote = utils.urljoin( self.base_url() , remote ) 

        if not local :
            (handle, fname) = tempfile.mkstemp()
        else :
            fname = local
            handle = os.open( fname , os.O_WRONLY | os.O_TRUNC | os.O_CREAT )
        try:
            response = urllib2.urlopen( remote )
            data = response.read(256)
            while data :
                os.write(handle, data)
                data = response.read(256)
            os.close(handle)
        except Exception as ex:
            repolib.logger.error( "Exception : %s" % ex )
            os.close(handle)
            os.unlink(fname)
            return False

        # On success, hand back the local file name (per the docstring above).
        return fname
Example #33
 def fix_url(cls, url):
     # 2377
     m = re.find(r'/board.php\?bo_table=([0-9a-zA-Z_]+)&wr_id=([0-9]+)',
                 url)
     if m:
         return urljoin(url, '/{}/{}'.format(*m))
     return url.split('?')[0]
Example #34
    def init(self):
        self.url = self.url.replace('manatoki_', '')

        self.session, self.soup, url = get_soup(self.url)
        self.url = self.fix_url(url)

        # 2377
        list = self.soup.find(attrs={'data-original-title': '목록'})
        if list:
            url = urljoin(self.url, list.parent['href'])
            nav = self.soup.find('div', class_='toon-nav')
            select = nav.find('select', {'name': 'wr_id'})
            for i, op in enumerate(select.findAll('option')[::-1]):
                if 'selected' in op.attrs:
                    break
            else:
                raise Exception('no selected option')
            self.session, self.soup, url = get_soup(url)
            self.url = self.fix_url(url)

            for i, page in enumerate(get_pages(self.url, self.soup)):
                if page.id == int(op['value']):
                    break
            else:
                raise Exception('can not find page')
            self.customWidget.range_p = [i]

        self.name
def read_playlist(url, n, cw=None):
    print_ = get_print(cw)
    for header in ['channel', 'user', 'c']:
        if '/{}/'.format(header) in url.lower():
            username = re.find(r'/{}/([^/\?]+)'.format(header), url,
                               re.IGNORECASE)
            url = urljoin(url, '/{}/{}/videos'.format(header, username))

    options = {
        'extract_flat': True,
        'playlistend': n,
    }
    ydl = ytdl.YoutubeDL(options)
    info = ydl.extract_info(url)

    es = info['entries']
    urls = []
    for e in es:
        href = 'https://www.youtube.com/watch?v={}'.format(e['id'])
        urls.append(href)
    info['urls'] = urls

    if 'uploader' not in info:
        title = info['title']
        if title.lower().endswith(' - videos'):
            title = title[:-len(' - videos')]
        info['uploader'] = title
        print_('⚠️ Fix uploader: None -> {}'.format(title))

    return info
Example #36
    def read(self):
        cw = self.customWidget

        title = self.get_title(self.url)

        ids = set()
        url = self.url
        while True:
            html = urlopen(url)
            soup = BeautifulSoup(html, "html.parser")
            tmp = soup.find_all(attrs={'class':'directlink'}, href=True)
            for image_html in tmp:
                image_url = image_html['href']
                id_ = self.get_id(image_url)
                if id_ in ids:
                    self.print_('duplicate: {}'.format(id_))
                    continue
                ids.add(id_)
                self.urls.append(image_url)
                self.filenames[image_url] = self.get_filename(image_url)

            if not cw.alive:
                break
            cw.setTitle('{}  {} - {}'.format(tr_('읽는 중...'), title, len(self.urls)))

            next_page = soup.find('a', attrs={'rel':'next'}, href=True)
            if not next_page:
                break
            else:
                url = urljoin(self.url, next_page['href'])

        self.title = title
Example #37
    def path_regex(self):
        """Return the regex for the path"""

        if self.locale_build:
            return self.build_list_regex

        return urljoin(self.build_list_regex, self.builds[self.build_index])
Example #38
def get_video(url, session, cw):
    print_ = get_print(cw)
    soup = downloader.read_soup(url, session=session)

    view = soup.find('div', id='player-container-fluid')
    fs = []
    for source in view.findAll('source'):
        src = urljoin(url, source.attrs['src'])
        res = re.find('([0-9]+)p', source.attrs['title'])
        res = int(res) if res else 0
        f = {'res': res, 'src': src}
        fs.append(f)
        print_(f)

    if not fs:
        raise Exception('No source')

    #4773
    res = max(get_resolution(), min(f['res'] for f in fs))
    print_(f'res: {res}')
    fs = sorted([f for f in fs if f['res'] <= res], key=lambda f: f['res'])
    f = fs[-1]
    print_(f'best: {f}')
    src_best = f['src']

    title = soup.find('h1').text.strip()
    id = soup.find('div', id='video').attrs['data-id']

    url_thumb = soup.find('meta', {'property': 'og:image'}).attrs['content']

    #src_best = downloader.real_url(src_best)

    video = Video(src_best, url_thumb, url, title, id, session)
    return video
Example #39
def get_info(url, cw=None):
    print_ = get_print(cw)
    info = {'videos': []}
    html = downloader.read_html(url)
    soup = Soup(html)
    info['title'] = soup.find('h2', class_='videoCnt_title').text.strip()

    id_ = re.find(PATTERN_ID, url, err='no id')
    print_('id: {}'.format(id_))
    token = re.find(
        r'''window.FC2VideoObject.push\(\[['"]ae['"], *['"](.+?)['"]''',
        html,
        err='no token')
    print_('token: {}'.format(token))

    url_api = 'https://video.fc2.com/api/v3/videoplaylist/{}?sh=1&fs=0'.format(
        id_)
    hdr = {
        'X-FC2-Video-Access-Token': token,
    }
    data = downloader.read_json(url_api, url, headers=hdr)

    url_video = urljoin(
        url, data['playlist'].get('nq') or data['playlist']['sample'])
    url_thumb = soup.find('meta', {'property': 'og:image'})['content']
    video = Video(url_video, url_thumb, url, info['title'], id_)
    info['videos'].append(video)

    return info
Example #40
 def fix_url(cls, url):
     url = re.sub(r'\?page=[0-9]+&', '?', url)
     url = re.sub(r'&page=[0-9]+', '', url)
     pool = re.find('/pool/show/([0-9]+)', url)
     if pool is not None:
         url = urljoin(url, '/post?tags=pool%3A{}'.format(pool))
     return url
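
The pool rewrite above works because joining against a reference that starts with '/' keeps only the scheme and host of the base URL. Assuming the urljoin in use follows standard-library semantics, and using a made-up booru URL:

from urllib.parse import urljoin

url = 'https://booru.example.net/pool/show/123?page=2'
print(urljoin(url, '/post?tags=pool%3A123'))
# -> https://booru.example.net/post?tags=pool%3A123
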
Example #41
    def get_build_info(self):
        """Defines additional build information"""

        # Internally we access builds via index
        url = urljoin(self.base_url, self.candidate_build_list_regex)
        self.logger.info('Retrieving list of candidate builds from %s' % url)

        parser = DirectoryParser(url, authentication=self.authentication,
                                 timeout=self.timeout_network)
        if not parser.entries:
            message = 'Folder for specific candidate builds at %s has not ' \
                'been found' % url
            raise errors.NotFoundError(message, url)

        self.show_matching_builds(parser.entries)
        self.builds = parser.entries
        self.build_index = len(parser.entries) - 1

        if self.build_number and \
                ('build%s' % self.build_number) in self.builds:
            self.builds = ['build%s' % self.build_number]
            self.build_index = 0
            self.logger.info('Selected build: build%s' % self.build_number)
        else:
            self.logger.info('Selected build: build%d' %
                             (self.build_index + 1))
Example #42
    def path_regex(self):
        """Return the regex for the path"""

        build_dir = 'try-%(PLATFORM)s%(DEBUG)s' % {
            'PLATFORM': self.platform_regex,
            'DEBUG': '-debug' if self.debug_build else ''}
        return urljoin(self.build_list_regex,
                       self.builds[self.build_index],
                       build_dir)
Example #43
 def buffer_sync(self):
     """bulk upsert of everything in self.buffer
     """
     data = protocol.serialize_web(
         [x.to_broadcast_json(include_hidden=True) for x in self.buffer])
     url = utils.urljoin(self.baseurl, self.docid + "/", 'bulkupsert')
     self.s.post(url, data=data)
     for m in self.buffer:
         m.set('created', True)
     self.buffer = []
Example #44
 def fetch(self, typename=None, id=None):
     if typename is None:
         url = utils.urljoin(self.baseurl, self.docid)
         data = self.s.get(url).content
         specs = self.ph.deserialize_web(data)
         models =  [ContinuumModel(
             x['type'], **x['attributes']) for x in specs]
         return models
     elif typename is not None and id is None:
         url = utils.urljoin(self.baseurl, self.docid +"/", typename)
         attrs = self.ph.deserialize_web(self.s.get(url).content)
         models = [ContinuumModel(typename, **x) for x in attrs]
         return models
     elif typename is not None and id is not None:
         url = utils.urljoin(self.baseurl, self.docid +"/", typename + "/", id)
         attr = self.ph.deserialize_web(self.s.get(url).content)
         if attr is None:
             return None
         model = ContinuumModel(typename, **attr)
         return model
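
The explicit + "/" appended to docid and typename in Examples #43 and #44 mirrors how relative references resolve against a base URL: without a trailing slash the last path segment is replaced rather than extended. A quick standard-library illustration (the project's utils.urljoin is variadic and may differ in detail):

from urllib.parse import urljoin

print(urljoin('http://host/docs/abc', 'models'))   # http://host/docs/models
print(urljoin('http://host/docs/abc/', 'models'))  # http://host/docs/abc/models
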
Example #45
 def create(self, model, defer=False):
     if not model.get('docs'):
         model.set('docs', [self.docid])
     if defer:
         self.buffer.append(model)
     else:
         url = utils.urljoin(self.baseurl,
                             self.docid + "/",
                             model.typename)
         log.debug("create %s", url)
         self.s.post(url, data=self.ph.serialize_msg(model.to_json()))
     return model
Example #46
    def is_build_dir(self, dir):
        """Return whether or not the given dir contains a build."""

        url = urljoin(self.base_url, self.monthly_build_list_regex, dir)

        if self.application in MULTI_LOCALE_APPLICATIONS \
                and self.locale != 'multi':
            url = urljoin(url, self.locale)

        parser = DirectoryParser(url, authentication=self.authentication,
                                 timeout=self.timeout_network)

        pattern = re.compile(self.binary_regex, re.IGNORECASE)
        for entry in parser.entries:
            try:
                pattern.match(entry).group()
                return True
            except:
                # No match, continue with next entry
                continue
        return False
Example #47
 def update(self, model, defer=False):
     model.set('doc', self.docid)        
     if defer:
         self.buffer.append(model)
     else:
         url = utils.urljoin(self.baseurl,
                             self.docid + "/",
                             model.typename + "/",
                             model.id +"/")
         log.debug("create %s", url)
         self.s.put(url, data=protocol.serialize_web(
             model.to_json(include_hidden=True)))
     return model
Example #48
    def is_build_dir(self, folder_name):
        """Return whether or not the given dir contains a build."""

        # Cannot move up to base scraper due to parser.entries call in
        # get_build_info_for_index (see below)
        url = '%s/' % urljoin(self.base_url, self.build_list_regex, folder_name)

        if self.application in APPLICATIONS_MULTI_LOCALE \
                and self.locale != 'multi':
            url = '%s/' % urljoin(url, self.locale)

        parser = self._create_directory_parser(url)

        pattern = re.compile(self.binary_regex, re.IGNORECASE)
        for entry in parser.entries:
            try:
                pattern.match(entry).group()
                return True
            except:
                # No match, continue with next entry
                continue
        return False
Example #49
 def create(self, model, defer=False):
     model.set('doc', self.docid)
     if defer:
         self.buffer.append(model)
     else:
         url = utils.urljoin(self.baseurl,
                             self.docid + "/",
                             model.typename +"/")
         log.debug("create %s", url)
         self.s.post(url, data=self.ph.serialize_msg(
             model.to_json(include_hidden=True)))
         model.set('created', True)
     return model
Example #50
    def __init__(self, directory, version, platform=None,
                 application='firefox', locale='en-US', extension=None,
                 authentication=None, retry_attempts=0, retry_delay=10.,
                 is_stub_installer=False, timeout=None, log_level='INFO',
                 base_url=BASE_URL):

        # Private properties for caching
        self._target = None
        self._binary = None

        self.directory = directory
        self.locale = locale
        self.platform = platform or self.detect_platform()
        self.version = version
        self.extension = extension or DEFAULT_FILE_EXTENSIONS[self.platform]
        self.authentication = authentication
        self.retry_attempts = retry_attempts
        self.retry_delay = retry_delay
        self.is_stub_installer = is_stub_installer
        self.timeout_download = timeout
        self.timeout_network = 60.

        self.logger = mozlog.getLogger(' ')
        self.logger.setLevel(getattr(mozlog, log_level.upper()))

        # build the base URL
        self.application = application
        self.base_url = urljoin(base_url, self.application)

        attempt = 0
        while True:
            attempt += 1
            try:
                self.get_build_info()
                break
            except (NotFoundError, requests.exceptions.RequestException) as e:
                if self.retry_attempts > 0:
                    # Log only if multiple attempts are requested
                    self.logger.warning("Build not found: '%s'" % e.message)
                    self.logger.info('Will retry in %s seconds...' %
                                     (self.retry_delay))
                    time.sleep(self.retry_delay)
                    self.logger.info("Retrying... (attempt %s)" % attempt)

                if attempt >= self.retry_attempts:
                    if hasattr(e, 'response') and \
                            e.response.status_code == 404:
                        message = "Specified build has not been found"
                        raise NotFoundError(message, e.response.url)
                    else:
                        raise
Example #51
    def get_build_info_for_index(self, build_index=None):
        url = urljoin(self.base_url, self.build_list_regex)

        self.logger.info('Retrieving list of builds from %s' % url)
        parser = self._create_directory_parser(url)
        parser.entries = parser.filter('.*-%s$' % self.changeset)

        if not parser.entries:
            raise errors.NotFoundError('No builds have been found', url)

        self.show_matching_builds(parser.entries)

        self.logger.info('Selected build: %s' % parser.entries[0])

        return (parser.entries, 0)
Example #52
 def fetch(self, typename=None, id=None, include_hidden=False):
     query = urllib.urlencode({'include_hidden' : include_hidden})
     if typename is None:
         url = utils.urljoin(self.baseurl, self.docid +"/") + "?" + query
         data = self.s.get(url).content
         specs = protocol.deserialize_web(data)
         models =  [make_model(x['type'], client=self, **x['attributes'])\
                    for x in specs]
         return models
     elif typename is not None and id is None:
         url = utils.urljoin(self.baseurl, self.docid +"/", typename + "/")
         url += "?" + query
         attrs = protocol.deserialize_web(self.s.get(url).content)
         models = [make_model(typename, client=self, **x) for x in attrs]
         return models
     elif typename is not None and id is not None:
         url = utils.urljoin(self.baseurl, self.docid +"/",
                             typename + "/", id +"/")
         url += "?" + query            
         attr = protocol.deserialize_web(self.s.get(url).content)
         if attr is None:
             return None
         model = make_model(typename, client=self, **attr)
         return model
Example #53
    def get_build_info_for_index(self, build_index=None):
        url = urljoin(self.base_url, self.build_list_regex)

        self.logger.info('Retrieving list of builds from %s' % url)
        parser = DirectoryParser(url, authentication=self.authentication,
                                 timeout=self.timeout_network)
        parser.entries = parser.filter('.*-%s$' % self.changeset)

        if not parser.entries:
            raise NotFoundError('No builds have been found', url)

        self.show_matching_builds(parser.entries)

        self.logger.info('Selected build: %s' % parser.entries[0])

        return (parser.entries, 0)
Example #54
    def get_build_info_for_date(self, date, build_index=None):
        url = urljoin(self.base_url, self.monthly_build_list_regex)
        has_time = date and date.time()

        self.logger.info('Retrieving list of builds from %s' % url)
        parser = DirectoryParser(url, authentication=self.authentication,
                                 timeout=self.timeout_network)
        regex = r'%(DATE)s-(\d+-)+%(BRANCH)s%(L10N)s%(PLATFORM)s$' % {
            'DATE': date.strftime('%Y-%m-%d'),
            'BRANCH': self.branch,
            # ensure to select the correct subfolder for localized builds
            'L10N': '' if self.locale in ('en-US', 'multi') else '(-l10n)?',
            'PLATFORM': '' if self.application not in (
                        'fennec',) else '-' + self.platform
        }
        parser.entries = parser.filter(regex)
        parser.entries = parser.filter(self.is_build_dir)

        if has_time:
            # If a time is included in the date, use it to determine the
            # build's index
            regex = r'.*%s.*' % date.strftime('%H-%M-%S')
            parser.entries = parser.filter(regex)

        if not parser.entries:
            date_format = '%Y-%m-%d-%H-%M-%S' if has_time else '%Y-%m-%d'
            message = 'Folder for builds on %s has not been found' % \
                self.date.strftime(date_format)
            raise errors.NotFoundError(message, url)

        self.show_matching_builds(parser.entries)
        # If no index has been given, set it to the last build of the day.
        if build_index is None:
            # Find the most recent non-empty entry.
            build_index = len(parser.entries)
            for build in reversed(parser.entries):
                build_index -= 1
                if not build_index or self.is_build_dir(build):
                    break
        self.logger.info('Selected build: %s' % parser.entries[build_index])

        return (parser.entries, build_index)
Example #55
    def get_build_info_for_version(self, version, build_index=None):
        url = urljoin(self.base_url, self.candidate_build_list_regex)

        self.logger.info('Retrieving list of candidate builds from %s' % url)
        parser = DirectoryParser(url, authentication=self.authentication,
                                 timeout=self.timeout_network)
        if not parser.entries:
            message = 'Folder for specific candidate builds at %s has not ' \
                'been found' % url
            raise NotFoundError(message, url)

        self.show_matching_builds(parser.entries)

        # If no index has been given, set it to the last build of the given
        # version.
        if build_index is None:
            build_index = len(parser.entries) - 1

        return (parser.entries, build_index)
Example #56
    def url(self):
        """Return the URL of the build"""

        return urljoin(self.path, self.binary)
Example #57
    def path(self):
        """Return the path to the build"""

        return urljoin(self.base_url, self.path_regex)
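
Examples #56 and #57 build the final download URL in two hops: path joins the base URL with the path regex, and url then joins that path with the binary name. With purely hypothetical values, and assuming the project's urljoin agrees with the standard library for simple two-argument joins:

from urllib.parse import urljoin

base_url = 'https://archive.example.org/pub/firefox/'
path_regex = 'nightly/latest-mozilla-central/'
binary = 'firefox-123.0a1.en-US.linux-x86_64.tar.bz2'

path = urljoin(base_url, path_regex)
# -> https://archive.example.org/pub/firefox/nightly/latest-mozilla-central/
url = urljoin(path, binary)
# -> .../latest-mozilla-central/firefox-123.0a1.en-US.linux-x86_64.tar.bz2
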
Example #58
 def buffer_sync(self):
     data = self.ph.serialize_web([x.to_broadcast_json() \
                                   for x in self.buffer])
     url = utils.urljoin(self.baseurl, self.docid + "/", 'bulkupsert')
     self.s.post(url, data=data)
     self.buffer = []
Example #59
 def delete(self, typename, id):
     url = utils.urljoin(self.baseurl, self.docid +"/", typename + "/", id)
     self.s.delete(url)
Example #60
    def __init__(self, destination=None, platform=None,
                 application='firefox', locale=None, extension=None,
                 username=None, password=None,
                 retry_attempts=0, retry_delay=10.,
                 is_stub_installer=False, timeout=None,
                 log_level='INFO',
                 base_url=BASE_URL):

        # Private properties for caching
        self._filename = None
        self._binary = None

        self.destination = destination or os.getcwd()

        if not locale:
            if application in APPLICATIONS_MULTI_LOCALE:
                self.locale = 'multi'
            else:
                self.locale = 'en-US'
        else:
            self.locale = locale

        self.platform = platform or self.detect_platform()

        if (username, password) == (None, None):
            self.authentication = None
        else:
            self.authentication = (username, password)

        self.retry_attempts = retry_attempts
        self.retry_delay = retry_delay
        self.is_stub_installer = is_stub_installer
        self.timeout_download = timeout
        self.timeout_network = 60.

        logging.basicConfig(format=' %(levelname)s | %(message)s')
        self.logger = logging.getLogger(self.__module__)
        self.logger.setLevel(log_level)

        # build the base URL
        self.application = application
        self.base_url = urljoin(base_url, APPLICATIONS_TO_FTP_DIRECTORY.get(
            self.application, self.application))

        if extension:
            self.extension = extension
        else:
            if self.application in APPLICATIONS_MULTI_LOCALE and \
                    self.platform in ('win32', 'win64'):
                # builds for APPLICATIONS_MULTI_LOCALE only exist in zip
                self.extension = 'zip'
            else:
                self.extension = DEFAULT_FILE_EXTENSIONS[self.platform]

        attempt = 0
        while True:
            attempt += 1
            try:
                self.get_build_info()
                break
            except (errors.NotFoundError, requests.exceptions.RequestException) as e:
                if self.retry_attempts > 0:
                    # Log only if multiple attempts are requested
                    self.logger.warning("Build not found: '%s'" % e.message)
                    self.logger.info('Will retry in %s seconds...' %
                                     (self.retry_delay))
                    time.sleep(self.retry_delay)
                    self.logger.info("Retrying... (attempt %s)" % attempt)

                if attempt >= self.retry_attempts:
                    if hasattr(e, 'response') and \
                            e.response.status_code == 404:
                        message = "Specified build has not been found"
                        raise errors.NotFoundError(message, e.response.url)
                    else:
                        raise