def test_get_sanitized_local_url(self):
    """Check ``uu.get_sanitized_url`` against a table of known inputs.

    An absolute local path is expected to gain a ``file://`` prefix, while
    ``file://`` URLs, ``http://`` URLs and relative paths are expected to
    come back unchanged.
    """
    # (raw input, expected sanitized output), checked in this order.
    cases = (
        ('/foo/bar', 'file:///foo/bar'),
        ('file:///foo/bar', 'file:///foo/bar'),
        ('http://foo/bar', 'http://foo/bar'),
        ('foo/bar', 'foo/bar'),
    )
    for raw, expected in cases:
        self.assertEqual(uu.get_sanitized_url(raw), expected)
def __init__(self, input_url, output_url):
    """Initialise the syncer with a single-entry scope derived from the input.

    Both URLs are first passed through ``get_sanitized_url``.  The scope is
    the sanitized input URL itself when it ends with ``/``; otherwise it is
    the head returned by ``os.path.split`` with a trailing ``/`` appended.

    :param input_url: forwarded (sanitized) to the parent constructor as
        ``input_urls``.
    :param output_url: forwarded (sanitized) to the parent constructor as
        ``output_url``.
    """
    source = get_sanitized_url(input_url)
    target = get_sanitized_url(output_url)

    # A URL that already ends with '/' is used as the scope as-is;
    # anything else is reduced to its parent location.
    scope_url = (source if source.endswith('/')
                 else os.path.split(source)[0] + '/')

    GenericSyncer.logger.debug('SCOPE: {0}'.format(scope_url))
    super(SingleURLSyncer, self).__init__(
        input_urls=source,
        output_url=target,
        scope=[scope_url],
    )
def __init__(self, input_url, output_url):
    """Build a syncer whose scope is the location containing ``input_url``.

    The two URLs are sanitized with ``get_sanitized_url`` before use.  When
    the sanitized input does not end with ``/``, the scope becomes the head
    returned by ``os.path.split`` plus a trailing ``/``; otherwise the
    sanitized input is the scope.

    :param input_url: sanitized and handed to the parent constructor as
        ``input_urls``.
    :param output_url: sanitized and handed to the parent constructor as
        ``output_url``.
    """
    clean_input = get_sanitized_url(input_url)
    clean_output = get_sanitized_url(output_url)

    scope_url = clean_input
    if not scope_url.endswith('/'):
        # Drop the last path component and restore the trailing slash.
        scope_url = os.path.split(scope_url)[0] + '/'

    GenericSyncer.logger.debug('SCOPE: {0}'.format(scope_url))

    parent = super(SingleURLSyncer, self)
    parent.__init__(input_urls=clean_input, output_url=clean_output,
                    scope=[scope_url])
def __init__(self, input_url, output_url):
    """Initialise the syncer with a fixed list of pages as its scope.

    Both URLs are sanitized with ``get_sanitized_url``.  The scope is built
    by appending each known page name to the head that ``os.path.split``
    returns for the sanitized input URL.

    :param input_url: sanitized and forwarded to the parent constructor as
        ``input_urls``.
    :param output_url: sanitized and forwarded to the parent constructor as
        ``output_url``.
    """
    source = get_sanitized_url(input_url)
    target = get_sanitized_url(output_url)

    # Pages that make up the scope, in the order they are registered.
    page_suffixes = (
        '/gettingStarted.html',
        '/keyboard-howto.html',
        '/table-howto.html',
        '/frame-howto.html',
        '/window-howto.html',
        '/javascript-howto.html',
        '/activeX-howto.html',
        '/logging.html',
        '/faq.html',
        '/index.html',
    )
    parent_url = os.path.split(source)[0]
    scope = [parent_url + suffix for suffix in page_suffixes]

    super(HtmlUnitSyncer, self).__init__(
        input_urls=source,
        output_url=target,
        scope=scope,
    )
def __init__(self, input_url, output_url):
    """Create the syncer, limiting its scope to a fixed set of pages.

    The input and output URLs go through ``get_sanitized_url`` first.  Each
    scope entry is the head of the sanitized input URL (as returned by
    ``os.path.split``) followed by one of the hard-coded page names.

    :param input_url: sanitized, then passed to the parent constructor as
        ``input_urls``.
    :param output_url: sanitized, then passed to the parent constructor as
        ``output_url``.
    """
    clean_input = get_sanitized_url(input_url)
    clean_output = get_sanitized_url(output_url)

    root = os.path.split(clean_input)[0]

    # Order matters only in that it mirrors the resulting scope list.
    scope = []
    for page in ('/gettingStarted.html', '/keyboard-howto.html',
                 '/table-howto.html', '/frame-howto.html',
                 '/window-howto.html', '/javascript-howto.html',
                 '/activeX-howto.html', '/logging.html',
                 '/faq.html', '/index.html'):
        scope.append(root + page)

    parent = super(HtmlUnitSyncer, self)
    parent.__init__(input_urls=clean_input, output_url=clean_output,
                    scope=scope)
def process_page_links(self, tree, local_url, url):
    """Build a ``DocumentLink`` for every link tag in ``tree`` with an href.

    For each tag returned by ``self.links(tree)`` that carries an ``href``
    attribute, the href is joined against ``url``, passed through
    ``get_url_without_hash``, and paired with the sanitized local URL
    computed by ``get_local_url`` from ``self.output_url``.  Tags without
    an ``href`` are skipped.

    :param tree: document tree handed to ``self.links``.
    :param local_url: not used by this method.
    :param url: base URL that relative hrefs are resolved against.
    :return: list of ``DocumentLink`` objects, in tag order.
    """
    collected = []
    for tag in self.links(tree):
        attrs = tag.attrib
        if 'href' not in attrs:
            continue

        target_url = get_url_without_hash(
            urlparse.urljoin(url, attrs['href']))
        local_target = get_sanitized_url(
            get_local_url(self.output_url, target_url))
        collected.append(DocumentLink(target_url, local_target))
    return collected
def download_content(file_from_path, force=False, real_browser=False):
    """Fetch the content at ``file_from_path`` and return it decoded.

    The location is sanitized with ``get_sanitized_url`` and opened either
    with ``get_file_from`` or, when ``real_browser`` is true, with
    ``get_file_from_real_browser``.  The raw content is decoded using the
    encoding reported by ``get_encoding``.

    :param file_from_path: URL or path of the resource to read.
    :param force: accepted but not used by this function.
    :param real_browser: open the URL with ``get_file_from_real_browser``
        instead of ``get_file_from``.
    :return: a ``(content, encoding)`` tuple, ``content`` being unicode.
    :raises RecoDocError: if any ``Exception`` occurs while opening,
        reading, closing or decoding.
    """
    url = get_sanitized_url(file_from_path)
    try:
        if not real_browser:
            file_from = get_file_from(url)
        else:
            file_from = get_file_from_real_browser(url)

        # Close the source even when read() fails, so the handle is not
        # leaked on the error path.
        try:
            logger.info('Downloading {0}'.format(url))
            content = file_from.read()
        finally:
            file_from.close()

        encoding = get_encoding(content)
        content = unicode(content, encoding)
        return (content, encoding)
    except Exception:
        logger.exception('Error while downloading a file: {0}'.format(url))
        raise RecoDocError('Error downloading {0}'.format(url))
def download_content(file_from_path, force=False, real_browser=False):
    """Read the resource at ``file_from_path`` and return its decoded text.

    The location is first passed through ``get_sanitized_url``.  It is then
    opened with ``get_file_from_real_browser`` when ``real_browser`` is
    true, and with ``get_file_from`` otherwise.  The bytes read are decoded
    with the encoding returned by ``get_encoding``.

    :param file_from_path: URL or path of the resource to read.
    :param force: accepted but not used by this function.
    :param real_browser: select ``get_file_from_real_browser`` as opener.
    :return: a ``(content, encoding)`` tuple, ``content`` being unicode.
    :raises RecoDocError: if any ``Exception`` is raised along the way;
        the original exception is logged with ``logger.exception``.
    """
    url = get_sanitized_url(file_from_path)
    try:
        if not real_browser:
            file_from = get_file_from(url)
        else:
            file_from = get_file_from_real_browser(url)

        try:
            logger.info('Downloading {0}'.format(url))
            content = file_from.read()
        finally:
            # Always release the source handle, including when read()
            # raised; previously it was leaked in that case.
            file_from.close()

        encoding = get_encoding(content)
        content = unicode(content, encoding)
        return (content, encoding)
    except Exception:
        logger.exception('Error while downloading a file: {0}'.format(
            url))
        raise RecoDocError('Error downloading {0}'.format(url))
def download_file(file_from_path, file_to_path, force=False, binary=False,
                  real_browser=False):
    """Copy the resource at ``file_from_path`` into ``file_to_path``.

    The source location is sanitized with ``get_sanitized_url``.  Nothing is
    downloaded when the destination already exists with a non-zero size,
    unless ``force`` is true.  In text mode the content is decoded with the
    encoding returned by ``get_encoding`` and written as UTF-8; in binary
    mode the bytes are copied unchanged.

    :param file_from_path: URL or path of the resource to read.
    :param file_to_path: local path to write to.
    :param force: download even if a non-empty destination exists.
    :param binary: copy raw bytes instead of decoding and re-encoding.
    :param real_browser: open the source with
        ``get_file_from_real_browser`` instead of ``get_file_from``.
    :raises RecoDocError: if any ``Exception`` occurs; the destination file
        is removed first when it exists.
    """
    url = get_sanitized_url(file_from_path)

    if (os.path.exists(file_to_path) and
            os.path.getsize(file_to_path) > 0 and
            not force):
        logger.info('Skipped downloading {0} because it already exists in '
                    '{1}'.format(url, file_to_path))
        return

    try:
        if not real_browser:
            file_from = get_file_from(url)
        else:
            file_from = get_file_from_real_browser(url)

        # Nested try/finally blocks guarantee both handles are closed
        # before the error handler below tries to delete the destination.
        try:
            if binary:
                file_to = open(file_to_path, 'wb')
            else:
                file_to = codecs.open(file_to_path, 'w', encoding='utf8')

            try:
                logger.info(
                    'Downloading {0} to {1} in mode binary? {2}'.format(
                        url, file_to_path, binary))
                if not binary:
                    content = file_from.read()
                    encoding = get_encoding(content)
                    content = unicode(content, encoding)
                    file_to.write(content)
                else:
                    shutil.copyfileobj(file_from, file_to)
            finally:
                file_to.close()
        finally:
            file_from.close()
    except Exception:
        logger.info('Error while downloading a file: {0}'.format(url))
        # Do not leave a partial file behind.
        if os.path.exists(file_to_path):
            os.remove(file_to_path)
        raise RecoDocError('Error downloading {0}'.format(url))
def download_file(file_from_path, file_to_path, force=False, binary=False,
                  real_browser=False):
    """Download ``file_from_path`` and store it at ``file_to_path``.

    The source is sanitized with ``get_sanitized_url``.  The download is
    skipped when ``file_to_path`` exists, is non-empty and ``force`` is
    false.  Text downloads are decoded with the encoding returned by
    ``get_encoding`` and written through a UTF-8 ``codecs`` writer; binary
    downloads are streamed with ``shutil.copyfileobj``.

    :param file_from_path: URL or path of the resource to read.
    :param file_to_path: local path to write to.
    :param force: download even if a non-empty destination exists.
    :param binary: copy raw bytes instead of decoding and re-encoding.
    :param real_browser: open the source with
        ``get_file_from_real_browser`` instead of ``get_file_from``.
    :raises RecoDocError: on any ``Exception``; an existing destination
        file is deleted before the error is raised.
    """
    url = get_sanitized_url(file_from_path)

    already_downloaded = (os.path.exists(file_to_path) and
                          os.path.getsize(file_to_path) > 0)
    if already_downloaded and not force:
        logger.info('Skipped downloading {0} because it already exists in '
                    '{1}'.format(url, file_to_path))
        return

    try:
        if not real_browser:
            file_from = get_file_from(url)
        else:
            file_from = get_file_from_real_browser(url)

        try:
            if binary:
                file_to = open(file_to_path, 'wb')
            else:
                file_to = codecs.open(file_to_path, 'w', encoding='utf8')

            try:
                logger.info(
                    'Downloading {0} to {1} in mode binary? {2}'.format(
                        url, file_to_path, binary))
                if binary:
                    shutil.copyfileobj(file_from, file_to)
                else:
                    content = file_from.read()
                    encoding = get_encoding(content)
                    content = unicode(content, encoding)
                    file_to.write(content)
            finally:
                # Closed on success and on failure, so the cleanup in the
                # handler below never operates on an open file.
                file_to.close()
        finally:
            # Previously leaked whenever anything above raised.
            file_from.close()
    except Exception:
        logger.info('Error while downloading a file: {0}'.format(
            url))
        if os.path.exists(file_to_path):
            os.remove(file_to_path)
        raise RecoDocError('Error downloading {0}'.format(url))
def test_get_sanitized_local_url(self):
    """Verify the output of ``uu.get_sanitized_url`` for four sample inputs.

    Only the absolute local path is expected to change (it gains a
    ``file://`` prefix); the other inputs are expected back verbatim.
    """
    expected_by_input = [
        ("/foo/bar", "file:///foo/bar"),
        ("file:///foo/bar", "file:///foo/bar"),
        ("http://foo/bar", "http://foo/bar"),
        ("foo/bar", "foo/bar"),
    ]
    for given, wanted in expected_by_input:
        sanitized = uu.get_sanitized_url(given)
        self.assertEqual(sanitized, wanted)