def site_a(site):
    if site[0:7] != 'http://':
        site = 'http://' + site
    opener = FancyURLopener()  # create the page 'grabber'
    page = opener.open(site)  # a test URL
    html = page.read()  # connects to the server and grabs the returned HTML
    # print html  # uncomment to see the raw HTML
    soup = BeautifulSoup(html, "lxml")  # strip the HTML tags to keep only the content
    for script in soup(["script", "style"]):
        script.extract()  # remove the JavaScript and CSS code
    conteudo = soup.get_text()
    limpa = ['com', 'br', 'www', 'http']
    site = re.sub(r'[^\w]', " ", site).split()
    novo_site = ''
    for a in site:
        if a not in limpa:
            novo_site += a
    site = novo_site
    file = open('site_w/' + site + '.txt', 'w')
    file.write(conteudo.encode('utf-8').lower())  # write the clean text (no HTML tags, JavaScript or CSS)
    lista_temas = {
        'esporte': ('futebol', 'bola', 'jogador', 'esporte', 'flamengo', 'vasco', 'botafogo', 'fluminense', 'sport'),
        'engenharia': ('engenharia', 'engenharias', 'engineer'),
        'jogos': ('jogo', 'jogos', 'game', 'games')
    }
    tema(lista_temas, site)
def unicode_urlopen(url, accept_lang=None):
    """Returns a *Unicode* file-like object for non-local documents.
    Client must ensure that the URL points to non-binary data. Pass in
    an Accept-Language value to configure the FancyURLopener we use."""
    opener = FancyURLopener()
    if accept_lang:
        opener.addheader("Accept-Language", accept_lang)

    # We want to convert the bytes file-like object returned by
    # urllib, which is bytes in both Python 2 and Python 3
    # fortunately, and turn it into a Unicode file-like object
    # with a little help from our StringIO friend.
    page = opener.open(url)
    encoding = page.headers['content-type']
    encoding = encoding.split('charset=')
    if len(encoding) > 1:
        encoding = encoding[-1]
        page = page.read().decode(encoding)
    else:
        page = page.read()
        encoding = meta_encoding(page) or 'utf8'
        page = page.decode(encoding)
    page = StringIO(page)
    return page
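A short usage sketch for unicode_urlopen; the URL is a placeholder, and meta_encoding() is assumed to be the module's own helper that sniffs a charset out of the markup:

# Usage sketch (placeholder URL):
page = unicode_urlopen("http://example.com/", accept_lang="en-US")
text = page.read()                # a Unicode string, already decoded
print text[:200].encode("utf-8")  # Python 2 print, matching the code above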
def ensureFileLocal(self, inFilePathOrURL):
    '''
    Takes a file path or URL. Sets self.localFilePath to the same path
    if the file is local, or if the file is remote but uncompressed.
    If a file is remote and compressed, retrieves the file into a local
    tmp file and returns that file name. In this case the flag
    self.deleteTempFile is set to True.

    :param inFilePathOrURL: file path or URL to file
    :type inFilePathOrURL: String
    '''
    self.localFilePath = inFilePathOrURL
    self.deleteTempFile = False

    if self.compression == COMPRESSION_TYPE.NO_COMPRESSION:
        return

    # Got compressed file; is it local?
    parseResult = urlparse(inFilePathOrURL)
    if parseResult.scheme == 'file':
        self.localFilePath = parseResult.path
        return

    opener = FancyURLopener()
    # Throws IOError if URL does not exist:
    self.localFilePath = opener.retrieve(inFilePathOrURL)[0]
    self.deleteTempFile = True
def utOpen(file):
    # Open file
    if 'http' in file:
        opener = FancyURLopener()
        f = opener.open(file)
    else:
        f = open(file, 'rb+')
    return f
def download(self, download_dir):
    dir_util.mkpath(download_dir)
    url = self.installer_url()
    print 'Downloading:', url
    web = FancyURLopener()
    web.retrieve(url, path.join(download_dir, path.basename(url)), display_progress)
def _get_sector_url(self, sector, length):
    start = sector * 2048
    if self._buff:
        self._buff.close()
    opener = FancyURLopener()
    # No-op the 206 handler so the opener does not treat the
    # Partial Content reply to our Range request as an error.
    opener.http_error_206 = lambda *a, **k: None
    opener.addheader("Range", "bytes=%d-%d" % (start, start + length - 1))
    self._buff = opener.open(self._url)
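For context, the same trick in a standalone sketch: overriding http_error_206 with a no-op stops FancyURLopener from raising on the 206 Partial Content reply, so the ranged body can be read directly (the function name is a placeholder):

def read_byte_range(url, start, length):
    # Sketch only: fetch bytes [start, start + length - 1] of a remote resource.
    opener = FancyURLopener()
    opener.http_error_206 = lambda *a, **k: None  # accept 206 Partial Content
    opener.addheader("Range", "bytes=%d-%d" % (start, start + length - 1))
    return opener.open(url).read()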
def fetch_genres(self):
    """ Grabs genres and returns tuple of genres """
    self.genre_url = 'http://www.shoutcast.com/sbin/newxml.phtml'
    self.urlhandler = FancyURLopener()
    self.fd = self.urlhandler.open(self.genre_url)
    self.genre = self.fd.read()
    self.fd.close()
    return self.genre
def fetch_stations(self):
    """ Grabs the xml list of stations from the shoutcast server """
    self.shout_url = 'http://www.shoutcast.com/sbin/newxml.phtml?genre=' + self.genre
    self.urlhandler = FancyURLopener()
    self.fd = self.urlhandler.open(self.shout_url)
    self.stations = self.fd.read()
    self.fd.close()
    return self.stations
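Both fetchers return raw XML; a hedged parsing sketch, assuming the legacy SHOUTcast format of a genrelist root element with genre children carrying a name attribute:

import xml.etree.ElementTree as ET

def parse_genres(xml_text):
    # Sketch only: pull genre names out of the assumed <genrelist> document.
    root = ET.fromstring(xml_text)
    return tuple(node.get('name') for node in root.findall('genre'))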
def download(self, download_dir):
    result = path.join(download_dir, self.package_basename)
    if path.exists(result):
        print 'Found install', self.package_basename
    else:
        dir_util.mkpath(download_dir)
        url = "http://www.eiffel-loop.com/download/" + self.package_basename
        print 'Downloading:', url
        web = FancyURLopener()
        web.retrieve(url, result, display_progress)
    return result
def _download_build(self):
    build_url = URL_PREFIX
    if build_number.startswith(('4', '5', '6')):
        build_url += "/ob"
    else:
        build_url += "/sb"
    build_url += "/build/%s" % build_number
    logger.info("Build url is %s" % build_url)
    resource = json.loads(urllib2.urlopen(build_url).read())
    deliverable_url = URL_PREFIX + "/%s" % resource[DELIVERABLE_URL_ATTR]
    infos = json.loads(urllib2.urlopen(deliverable_url).read())
    for info in infos[LIST_ATTR]:
        if info[DOWNLOAD_URL_ATTR].find("VMware-viewagent-x86_64") > 0:
            FancyURLopener(proxies={}).retrieve(info[DOWNLOAD_URL_ATTR], INSTALL_FILE)
            logger.info('Downloaded %s to %s successfully' % (info[DOWNLOAD_URL_ATTR], INSTALL_FILE))
def download_package(pkg_name, pkg_version):
    file_name, path, hash_algorithm, expected_digest = get_package_info(pkg_name, pkg_version)
    if not file_name:
        return False
    if os.path.isfile(file_name) and check_digest(file_name, hash_algorithm, expected_digest):
        print('File with matching digest already exists, skipping {0}'.format(file_name))
        return True

    downloader = FancyURLopener()
    pkg_url = '{0}/packages/{1}'.format(PYPI_MIRROR, path)
    print('Downloading {0} from {1}'.format(file_name, pkg_url))
    downloader.retrieve(pkg_url, file_name)

    if check_digest(file_name, hash_algorithm, expected_digest):
        return True
    else:
        print('Hash digest check failed in file {0}.'.format(file_name))
        return False
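check_digest is not defined in this snippet; a plausible sketch, assuming hash_algorithm is a hashlib algorithm name (e.g. 'md5' or 'sha256') and expected_digest a hex string:

import hashlib

def check_digest(file_name, hash_algorithm, expected_digest):
    # Hypothetical helper: hash the file in chunks and compare hex digests.
    digest = hashlib.new(hash_algorithm)
    with open(file_name, 'rb') as f:
        for chunk in iter(lambda: f.read(8192), b''):
            digest.update(chunk)
    return digest.hexdigest() == expected_digest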
def retrieveCatalog():
    try:
        cache = SimpleCache()
        catalog = cache.get(ADDON_NAME + '.catalog')
        if catalog:
            log("using cached catalog")
        if not catalog:
            log("downloading catalog")
            opener = FancyURLopener()
            f = opener.open(url)
            catalog = json.load(f)
            cache.set(ADDON_NAME + '.catalog', catalog, expiration=datetime.timedelta(hours=12))
        return catalog
    except Exception as e:
        log("error retrieving catalog - " + str(e), xbmc.LOGERROR)
        xbmcgui.Dialog().notification(ADDON_NAME, LANGUAGE(30003), ICON, 4000)
        xbmc.executebuiltin('Action(PreviousMenu)')
        sys.exit(0)
def __install_grinder(self, grinder_path):
    """
    Installs Grinder.
    Grinder version and download link may be set in config:
    "download-link": "http://domain/resource-{version}.zip"
    "version": "1.2.3"
    """
    dest = os.path.dirname(os.path.dirname(os.path.expanduser(grinder_path)))
    if not dest:
        dest = os.path.expanduser("~/grinder-taurus")
    dest = os.path.abspath(dest)
    grinder_full_path = os.path.join(dest, "lib", "grinder.jar")
    try:
        self.__grinder(grinder_full_path)
        return grinder_full_path
    except CalledProcessError:
        self.log.info("Will try to install grinder into %s", dest)

    downloader = FancyURLopener()
    grinder_zip_path = self.engine.create_artifact("grinder-dist", ".zip")
    version = self.settings.get("version", GrinderExecutor.VERSION)
    download_link = self.settings.get("download-link", GrinderExecutor.DOWNLOAD_LINK)
    download_link = download_link.format(version=version)
    self.log.info("Downloading %s", download_link)

    try:
        downloader.retrieve(download_link, grinder_zip_path, download_progress_hook)
    except BaseException as e:
        self.log.error("Error while downloading %s", download_link)
        raise e

    self.log.info("Unzipping %s", grinder_zip_path)
    unzip(grinder_zip_path, dest, 'grinder-' + version)
    os.remove(grinder_zip_path)
    self.log.info("Installed grinder successfully")
    return grinder_full_path
def __install_gatling(self, gatling_path):
    """
    Installs Gatling.
    Gatling version and download link may be set in config:
    "download-link": "http://domain/resource-{version}.zip"
    "version": "1.2.3"
    """
    dest = os.path.dirname(os.path.dirname(os.path.expanduser(gatling_path)))  # ../..
    dest = os.path.abspath(dest)
    try:
        self.__gatling(gatling_path)
        return gatling_path
    except OSError:
        self.log.info("Will try to install Gatling into %s", dest)

    # download gatling
    downloader = FancyURLopener()
    gatling_zip_path = self.engine.create_artifact("gatling-dist", ".zip")
    version = self.settings.get("version", GatlingExecutor.VERSION)
    download_link = self.settings.get("download-link", GatlingExecutor.DOWNLOAD_LINK)
    download_link = download_link.format(version=version)
    self.log.info("Downloading %s", download_link)

    # TODO: check archive checksum/hash before unzip and run
    try:
        downloader.retrieve(download_link, gatling_zip_path, download_progress_hook)
    except BaseException as e:
        self.log.error("Error while downloading %s", download_link)
        raise e

    self.log.info("Unzipping %s", gatling_zip_path)
    unzip(gatling_zip_path, dest, 'gatling-charts-highcharts-bundle-' + version)
    os.remove(gatling_zip_path)
    os.chmod(os.path.expanduser(gatling_path), 0o755)
    self.log.info("Installed Gatling successfully")
def fetchURL(url, file='', params=None, headers={}, isBinary=False, encodeURL=True):
    log("> bbbLib.fetchURL() %s isBinary=%s encodeURL=%s" % (url, isBinary, encodeURL))
    if encodeURL:
        safe_url = quote_plus(url, '/:&?=+#@')
    else:
        safe_url = url

    success = False
    data = None
    if not file:
        # create temp file
        file = xbmc.translatePath("special://temp/temp.html")
    # remove destination file if it already exists
    deleteFile(file)

    # fetch from url
    try:
        opener = FancyURLopener()
        # add headers if supplied
        if 'User-Agent' not in headers and 'User-agent' not in headers:
            headers['User-Agent'] = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        for name, value in headers.items():
            opener.addheader(name, value)

        fn, resp = opener.retrieve(safe_url, file, data=params)
        # print fn, resp
        content_type = resp.get("Content-Type", '').lower()
        # fail if expecting an image but a text content type was returned
        if isBinary and content_type.find("text") != -1:
            raise IOError("Not Binary")

        opener.close()
        del opener
        urlcleanup()
    except IOError, errobj:
        ErrorCode(errobj)
def _getlinesfromurl(self, url):
    err = 0
    strerr = ''
    # Retry URL download a few times.
    for count in range(self.retries):
        if count != 0:
            time.sleep(self.retrysecs)
        try:
            opener = FancyURLopener()
            f = opener.open(url, data='user_name=%s&password=%s&login=Login' % (self.username, self.password))
            rc = 0
            if 'www-authenticate' in f.headers:
                rc = 1
                strerr = 'Authentication is required to access %s' % url
            break
        except IOError, (_err, _strerr):
            rc = 1
            print url
            print _strerr
            (err, strerr) = (_err, _strerr)
def download(self):
    bean = self.bean
    update = self.update
    if not bean or not bean.path:
        return None

    opener = FancyURLopener()
    remote = opener.open(bean.path)
    remote_size = 0
    if "Content-Length" in remote.headers:
        remote_size = int(remote.headers["Content-Length"])
        bean.size = size2text(remote_size)

    block_size = 4096
    block_count = 0

    ext = get_file_extension(bean.path)

    path = FC().online_save_to_folder
    if not os.path.isdir(path):
        os.makedirs(path)

    if bean.save_to:
        to_file = os.path.join(bean.save_to, bean.text + ".mp3")
    else:
        to_file = get_bean_download_path(bean, FC().online_save_to_folder)

    if not os.path.exists(os.path.dirname(to_file)):
        os.makedirs(os.path.dirname(to_file))

    to_file_tmp = to_file + ".tmp"

    if os.path.exists(to_file_tmp):
        bean.status = DOWNLOAD_STATUS_INACTIVE
        bean.to_file = to_file
        update(bean)
        return None

    if os.path.exists(to_file):
        bean.status = DOWNLOAD_STATUS_COMPLETED
        bean.to_file = to_file
        update(bean)
        return None

    bean.save_to = to_file

    with file(to_file_tmp, "wb") as tmp_file:
        data = True
        # begin download
        self.bean.status = DOWNLOAD_STATUS_DOWNLOADING
        self.bean.path = to_file
        self.update(self.bean)

        while data:
            data = remote.read(block_size)
            if data:
                block_count += 1
                tmp_file.write(data)
                #time.sleep(0.1)
                persent = block_count * block_size * 100.0 / remote_size
                if block_count % 50 == 0:
                    bean.persent = persent
                    update(bean)
                    time.sleep(0.5)

    # update file info on finish
    logging.debug("rename %s - %s" % (to_file_tmp, to_file))
    os.rename(to_file_tmp, to_file)
    bean.status = DOWNLOAD_STATUS_COMPLETED
    bean.to_file = to_file
    bean.persent = 100
    update(bean)
def urlopen_custom(req, rawserver):
    global _urlopener

    if not _urlopener:
        opener = FancyURLopener()
        _urlopener = opener
        # remove User-Agent
        del _urlopener.addheaders[:]

    if not isinstance(req, str):
        #for header in r.headers:
        #    _urlopener.addheaders.append((header, r.headers[header]))
        #return _urlopener.open(r.get_full_url(), r.data)

        # All this has to be done manually, since httplib and urllib 1 and 2
        # add headers to the request that some routers do not accept.
        # A minimal, functional request includes the headers:
        #     Content-Length
        #     Soapaction
        # I have found the following to be specifically disallowed:
        #     User-agent
        #     Connection
        #     Accept-encoding
        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        (scheme, netloc, path, params, query, fragment) = urlparse.urlparse(req.get_full_url())

        if not scheme.startswith("http"):
            raise ValueError("UPnP URL scheme is not http: " + req.get_full_url())

        if len(path) == 0:
            path = '/'

        if netloc.count(":") > 0:
            host, port = netloc.split(':', 1)
            try:
                port = int(port)
            except:
                raise ValueError("UPnP URL port is not int: " + req.get_full_url())
        else:
            host = netloc
            port = 80

        header_str = ''
        data = ''
        method = ''
        header_str = " " + path + " HTTP/1.0\r\n"
        if req.has_data():
            method = 'POST'
            header_str = method + header_str
            header_str += "Content-Length: " + str(len(req.data)) + "\r\n"
            data = req.data + "\r\n"
        else:
            method = 'GET'
            header_str = method + header_str

        header_str += "Host: " + host + ":" + str(port) + "\r\n"

        for header in req.headers:
            header_str += header + ": " + str(req.headers[header]) + "\r\n"

        header_str += "\r\n"
        data = header_str + data

        try:
            rawserver.add_pending_connection(host)
            s.connect((host, port))
        finally:
            rawserver.remove_pending_connection(host)

        s.send(data)
        r = HTTPResponse(s, method=method)
        r.begin()

        r.recv = r.read
        fp = socket._fileobject(r)

        resp = addinfourl(fp, r.msg, req.get_full_url())
        resp.code = r.status
        resp.msg = r.reason

        return resp
    return _urlopener.open(req)
from rovin.belex.BelexParser import BelexParser
from urllib import FancyURLopener
from os import path

url = "http://www.ejustice.just.fgov.be/cgi_loi/loi_a1.pl?language=nl&table_name=wet&la=N&cn=1994021730&&caller=list&N&fromtab=wet"
filename = "constitution-nl.html"

if not path.isfile(filename):
    print "Downloading", url
    downloader = FancyURLopener()
    downloader.retrieve(url, filename)

f = open(filename, 'r')
html = f.read()

parser = BelexParser()
parser.feed(html)

for article in parser.all_articles():
    print " <<<< "
    print article.number
    print "BODY:" + article.body
    print " >>>> "
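One portability note on the import above: FancyURLopener lives in urllib only on Python 2; on Python 3 it moved to urllib.request (where it is deprecated). Scripts like this one are sometimes guarded with a dual import, a sketch of which is:

# Compatibility sketch for the FancyURLopener import:
try:
    from urllib import FancyURLopener          # Python 2
except ImportError:
    from urllib.request import FancyURLopener  # Python 3 (deprecated there)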
def get_poster(self, item):
    """Returns file path to the new poster"""
    from movie import Progress, Retriever

    file_to_copy = tempfile.mktemp(suffix=self.widgets['movie']['number'].get_text(),
                                   dir=self.locations['temp'])
    file_to_copy += ".jpg"
    canceled = False
    try:
        progress = Progress(self.widgets['window'], _("Fetching poster"), _("Wait a moment"))
        retriever = Retriever(item.LargeImage.URL, self.widgets['window'], progress, file_to_copy)
        retriever.start()
        while retriever.isAlive():
            progress.pulse()
            if progress.status:
                canceled = True
            while gtk.events_pending():
                gtk.main_iteration()
        progress.close()
        urlcleanup()
    except:
        canceled = True
        gutils.warning(_("Sorry. A connection error has occurred."))
        try:
            os.remove(file_to_copy)
        except:
            log.error("no permission for %s" % file_to_copy)

    if not canceled:
        if os.path.isfile(file_to_copy):
            im = None
            try:
                im = Image.open(file_to_copy)
            except IOError:
                log.warn("failed to identify %s" % file_to_copy)
            if im and im.size == (1, 1):
                url = FancyURLopener().open("http://www.amazon.com/gp/product/images/%s" % item.ASIN).read()
                if url.find('no-img-sm._V47056216_.gif') > 0:
                    log.warn('No image available')
                    gutils.warning(_("Sorry. This movie is listed but has no poster available at Amazon.com."))
                    return False
                url = gutils.after(url, 'id="imageViewerDiv"><img src="')
                url = gutils.before(url, '" id="prodImage"')
                urlretrieve(url, file_to_copy)
                try:
                    im = Image.open(file_to_copy)
                except IOError:
                    log.warn("failed to identify %s", file_to_copy)
            if not im:
                # something wrong with the image, give some feedback to the user
                log.warn('No image available')
                gutils.warning(_("Sorry. This movie is listed but has no poster available at Amazon.com."))
                return False
            if im.mode != 'RGB':  # convert GIFs
                im = im.convert('RGB')
                im.save(file_to_copy, 'JPEG')
            # set to None because the file is locked otherwise (os.remove throws an exception)
            im = None

            handler = self.widgets['big_poster'].set_from_file(file_to_copy)
            self.widgets['poster_window'].show()
            self.widgets['poster_window'].move(0, 0)
            if gutils.question(_("Do you want to use this poster instead?"), self.widgets['window']):
                return file_to_copy
            else:
                log.info("Reverting to previous poster and deleting new one from disk.")
                try:
                    os.remove(file_to_copy)
                except:
                    log.error('cannot remove %s', file_to_copy)
                self.widgets['poster_window'].hide()
        else:
            gutils.warning(_("Sorry. This movie is listed but has no poster available at Amazon.com."))
    else:
        # cleanup temporary files after canceling the download
        if os.path.isfile(file_to_copy):
            try:
                os.remove(file_to_copy)
            except:
                log.error('cannot remove %s', file_to_copy)
def _fetch_sources(self, workspace, package_dir, package_dest_dir):
    def checkout(self, source_url, cache_dir, workspace_dir):
        if not os.path.exists(cache_dir):
            print 'No cache detected. Cloning a fresh cache'
            self.sh('%' + '{git} clone --mirror "%s" "%s"' % (source_url, cache_dir))
        else:
            print 'Updating existing cache'
            self.cd(cache_dir)
            self.sh('%{git} fetch --all --prune')

        if not os.path.exists(workspace_dir):
            print 'No workspace checkout detected. Cloning a fresh workspace checkout from the cache'
            self.sh('%' + '{git} clone --local --shared "%s" "%s"' % (cache_dir, workspace_dir))
            self.cd(workspace_dir)
        else:
            print 'Updating existing workspace checkout'
            self.cd(workspace_dir)
            self.sh('%{git} clean -xffd')
            self.sh('%{git} reset --hard')
            self.sh('%{git} fetch --all --prune')

        if self.revision != None:
            self.sh('%' + '{git} checkout %s' % self.revision)
        elif self.git_branch != None:
            self.sh('%' + '{git} checkout origin/%s' % self.git_branch)
        else:
            self.sh('%{git} checkout origin/master')

    def get_local_filename(source):
        return source if os.path.isfile(source) else os.path.join(
            package_dest_dir, os.path.basename(source))

    def get_cache_name(name):
        if self.organization is None:
            return self.name
        else:
            return self.organization + "+" + name

    if self.sources is None:
        return

    if not os.path.exists(package_dest_dir):
        os.mkdir(package_dest_dir)

    local_sources = []
    for source in self.sources:
        local_source = os.path.join(package_dir, source)
        local_source_file = os.path.basename(local_source)
        local_dest_file = get_local_filename(local_source)
        local_sources.append(local_dest_file)

        if os.path.isfile(local_source):
            if filecmp.cmp(local_source, local_dest_file):
                log(1, 'using cached source: %s' % local_dest_file)
            else:
                log(1, 'copying local source: %s' % local_source_file)
                shutil.copy2(local_source, local_dest_file)
        elif source.startswith(('http://', 'https://', 'ftp://')):
            if os.path.isfile(local_dest_file):
                try:
                    self.extract_archive(local_dest_file, True)
                    log(1, 'using cached source: %s' % local_dest_file)
                except:
                    log(1, 'local cache is corrupt for: %s' % local_dest_file)
                    os.remove(local_dest_file)
            if not os.path.isfile(local_dest_file):
                log(1, 'downloading remote source: %s' % source)
                filename, message = FancyURLopener().retrieve(source, local_dest_file)
        elif source.startswith(('git://', 'file://', 'ssh://')) or source.endswith('.git'):
            log(1, 'cloning or updating git repository: %s' % source)
            local_name = os.path.splitext(os.path.basename(source))[0]
            local_dest_file = os.path.join(package_dest_dir,
                                           '%s.gitmirror' % (get_cache_name(local_name)))
            local_sources.pop()
            local_sources.append(local_dest_file)
            working_dir = os.getcwd()
            try:
                checkout(self, source, local_dest_file, workspace)
            except Exception as e:
                if os.path.exists(local_dest_file):
                    print 'Deleting ' + local_dest_file + ' cache due to git error'
                    shutil.rmtree(local_dest_file, ignore_errors=True)
                if os.path.exists(workspace):
                    print 'Deleting ' + workspace + ' cache due to git error'
                    shutil.rmtree(workspace, ignore_errors=True)
                # Explicitly reset the working dir to a known directory which has not been deleted
                # 'git clone' does not work if you are in a directory which has been deleted
                os.chdir(working_dir)
                checkout(self, source, local_dest_file, workspace)
            finally:
                os.chdir(workspace)
        else:
            raise Exception('missing source: %s' % source)

    self.sources = local_sources
def main(argv=None):  # {{{
    # Separates the URL into a directory and the file or pattern based on the
    # last appearance of '/'.
    if len(sys.argv) > 1:
        pivot = sys.argv[1].rfind("/")
        url = sys.argv[1][:pivot]
        pivot += 1
        find = sys.argv[1][pivot:]
    else:
        print "Invalid input!"
        print ""
        print "Try: 'DownloadExternalPackage.py url [localFile]'"
        print ""
        print "Where 'url' is the URL with an explicit package name, or the URL followed by the"
        print "truncated package name, and 'localFile' is the file name (including extension)"
        print "that you would like to save as."
        print ""
        print "Examples:"
        print ""
        print "  DownloadExternalPackage.py 'http://issm.jpl.nasa.gov/files/externalpackages/petsc-2.3.2-p3.tar.gz' 'petsc-2.3.2-p3.tar.gz'"
        print "    The old style and the safest way to download a package."
        print ""
        print "  DownloadExternalPackage.py 'http://issm.jpl.nasa.gov/files/externalpackages/libtool' 'libtool.tar.gz'"
        print "    The new style. For packages like 'Libtool', which we never expect to be using"
        print "    multiple versions of, this downloads the most recent version and saves it as"
        print "    the generic 'libtool.tar.gz'."
        print ""
        print "  DownloadExternalPackage.py 'http://issm.jpl.nasa.gov/files/externalpackages/gsl-1.' 'gsl-1.15.tar.gz'"
        print "    The new style. Demonstrates how this script can disambiguate a package name"
        print "    if there is more than one package matching 'gsl-'."
        print ""
        print "  DownloadExternalPackage.py 'http://issm.jpl.nasa.gov/files/externalpackages/libtool'"
        print "    The new style. Downloads a package with 'libtool' as a prefix and saves it"
        print "    as its canonical name."
        return 1  # nothing to do without a URL

    if len(sys.argv) > 2:
        localFile = sys.argv[2]
        print "Downloaded file will be saved as: " + localFile
    else:
        localFile = None
        print "Downloaded file will be saved with the same file name."

    print "Looking for: " + find

    # As an extra precaution, if no extension is given for a particular package
    # such as '.../libtool', then ensure that files found are of appropriate
    # file extensions.
    #
    # WARNING: The external packages directory includes executable binaries with
    # '.exe' extensions. As such, '.exe' is an acceptable suffix, but this is
    # inherently dangerous since this script can be used to download from any
    # valid website. Furthermore, if an individual attempts a "man-in-the-middle"
    # attack, then the user would be capable of downloading executables from
    # an untrusted source.
    pattern = find + r"[\w.-]*(\.tar\.gz|tar\.gz2|tgz|zip|exe)?"
    parser = MyHTMLParser(pattern)

    # Creates a 'FancyURL' which allows the script to fail gracefully by catching
    # HTTP error codes 30X and several 40X (where 'X' is a natural number).
    urlObject = FancyURLopener()
    obj = urlObject.open(url)
    parser.feed(obj.read())

    # If a file pattern was used to describe the file that should be downloaded,
    # then there is the potential for multiple file matches. Currently, the script
    # will detect this ambiguity and print out all the matches, while informing
    # the user that they must refine their search.
    #
    # TODO: Prompt the user to select their preferred target from a list.
    if len(parser.targets) > 1:
        print "Could not resolve your download due to the number of hits."
        print "Refine your search."
        for i in parser.targets:
            print i
    elif len(parser.targets) == 1:
        print "Found: " + parser.targets[0]
        url += "/" + parser.targets[0]
        if localFile is None:
            if os.path.exists(parser.targets[0]):
                print "File " + parser.targets[0] + " already exists and will not be downloaded..."
            else:
                urllib.urlretrieve(url, parser.targets[0])
                print "File saved as: " + parser.targets[0]
        else:
            if os.path.exists(localFile):
                print "File " + localFile + " already exists and will not be downloaded..."
            else:
                if parser.targets[0] == localFile:
                    print "File found and destination match."
                elif parser.matcher.match(localFile) is not None:
                    print "File found matches destination pattern."
                else:
                    print "WARNING: the file found '" + parser.targets[0] + "' does not match '" + localFile + "'"
                    print "Ensure the downloaded version is suitable."
                urllib.urlretrieve(url, localFile)
                print "File saved as: " + localFile
    else:
        print "No matches found!"

    obj.close()