def read_component_sitemap(self, sitemapindex_uri, sitemap_uri, sitemap,
                           sitemapindex_is_file):
    """Read a component sitemap of a Resource List with index

    Each component must be a sitemap with the
    """
    if (sitemapindex_is_file):
        if (not self.is_file_uri(sitemap_uri)):
            # Attempt to map URI to local file
            remote_uri = sitemap_uri
            sitemap_uri = self.mapper.src_to_dst(remote_uri)
            self.logger.info("Mapped %s to local file %s" %
                             (remote_uri, sitemap_uri))
    else:
        # The individual sitemaps should be at a URL (scheme/server/path)
        # that the sitemapindex URL can speak authoritatively about
        if (self.check_url_authority and
                not UrlAuthority(sitemapindex_uri).has_authority_over(sitemap_uri)):
            raise ListBaseIndexError(
                "The sitemapindex (%s) refers to sitemap at a location "
                "it does not have authority over (%s)" %
                (sitemapindex_uri, sitemap_uri))
    try:
        fh = URLopener().open(sitemap_uri)
        self.num_files += 1
    except IOError as e:
        raise ListBaseIndexError(
            "Failed to load sitemap from %s listed in sitemap index %s (%s)" %
            (sitemap_uri, sitemapindex_uri, str(e)))
    # Get the Content-Length if we can (works fine for local files)
    try:
        self.content_length = int(fh.info()['Content-Length'])
        self.bytes_read += self.content_length
    except KeyError:
        # If we don't get a length then c'est la vie
        pass
    self.logger.info("Reading sitemap from %s (%d bytes)" %
                     (sitemap_uri, self.content_length))
    component = sitemap.parse_xml(fh=fh, sitemapindex=False)
    # Copy resources into self, check any metadata
    for r in component:
        self.resources.add(r)

def getRetriever(scheme):
    """
    Get the right retriever function depending on the scheme.
    If the scheme is 'http', return urllib.urlretrieve; if the scheme is
    'https', create a URLopener with certificates taken from the
    X509_USER_PROXY variable. If certificates are not available, fall back
    to urllib.urlretrieve as in the http case.
    """
    # Initialise certfile so the check below never hits an unbound name.
    certfile = None
    if 'X509_USER_PROXY' in os.environ and os.path.isfile(
            os.environ['X509_USER_PROXY']):
        certfile = os.environ['X509_USER_PROXY']
    elif scheme == 'https':
        print(
            "User proxy not found. Trying to retrieve the file without using certificates"
        )
    if scheme == 'http' or not certfile:
        retriever = urllib.urlretrieve
    else:
        print("Using %s as X509 certificate" % certfile)
        op = URLopener(None, key_file=certfile, cert_file=certfile)
        op.addheader('Accept', 'application/octet-stream')
        retriever = op.retrieve
    return retriever

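A minimal usage sketch for the retriever returned above (not from the original source): the URL and destination path are placeholders, and urlparse supplies the scheme. Both urllib.urlretrieve and URLopener.retrieve return a (filename, headers) pair.

from urlparse import urlparse  # Python 2

def fetch(url, dest):
    # Pick the transport based on the URL scheme, then download.
    retriever = getRetriever(urlparse(url).scheme)
    filename, headers = retriever(url, dest)
    return filename

# fetch('https://example.org/payload.root', '/tmp/payload.root')
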
def getcif(target):
    """
    Get all ICSD cif files listed in target file.
    The target file should contain tags like '# BCC'.
    """
    matgenIDs = getMatgenIDs()

    if not os.path.isdir('./ciffiles'):
        os.makedirs('./ciffiles')

    with open(target, 'r') as f:
        st = f.readline()
        t1 = time.time()
        while st:
            if st[0] == '#':
                tg = st.split()[-1]
                st = f.readline()
                t2 = time.time()
                print "time for the %s = %2.2f sec" % (tg, t2 - t1)
                t1 = time.time()
                continue
            st = st.strip()
            ind = getID(st)
            if ind in matgenIDs:
                # Skip matgen compounds; advance to the next line first,
                # otherwise the loop would spin forever on the same entry.
                st = f.readline()
                continue
            URL = prefix + tg + '/' + st + '/' + st + '.cif'
            testfile = URLopener()
            try:
                testfile.retrieve(URL, 'ciffiles/' + st)
            except:
                print "Error: ", URL
            st = f.readline()

def unshortenurl(short):
    from urllib import URLopener
    opener = URLopener()
    try:
        opener.open(short)
    except IOError, e:
        # The plain URLopener raises IOError('http error', code, msg,
        # headers) for any non-200 response; for a shortener's redirect
        # the headers carry the unshortened target in Location.
        f = e
        if len(f.args) >= 4 and 'Location' in f.args[3]:
            return f.args[3]['Location']
    return short

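For context (an assumption about the snippet's intent, not stated in the original): the plain URLopener, unlike FancyURLopener, does not follow redirects, so a shortener's 301/302 surfaces as the IOError caught above, whose fourth argument is the response headers.

# Hypothetical short URL; a real shortener would answer with a
# redirect whose Location header is the unshortened target.
# print unshortenurl('http://tinyurl.com/example')
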
class check_the_mangas():
    def __init__(self, manga_name):
        self.manga_name = manga_name
        self.myfile = open(configuration.DATA_FILE, 'r').read()
        self.manga_oldnumber = self.get_number()
        self.manga_nownumber = self.manga_oldnumber
        self.manga_olddate = self.get_date()
        self.nowdate = self.today_date()
        self.br = URLopener()

    def get_number(self):
        return re.findall(self.manga_name + ':([0-9]+):', self.myfile)[0]

    def get_date(self):
        return re.findall(self.manga_name + ":" + str(self.manga_oldnumber) + ':(.*)\n', self.myfile)[0]

    def today_date(self):
        return subprocess.check_output(["date", "+%a-%b-%e"]).replace("\n", "")

    # return 1 if the connection is working
    def test_connection(self):
        try:
            response = self.br.open(configuration.WEBSITE_TO_CHECK_CONNECTION).read()
            if configuration.KEYWORD in response:
                return 1
            else:
                return 0
        except:
            print "manga connection"
            return 0

    def exec_cmd(self):
        pid = os.fork()
        os.umask(0)
        os.system(configuration.MANGA_NEW_CMD.replace("MANGA", self.manga_name))

    def run(self):
        if (self.test_connection()):
            last_chapter = False
            try:
                while (last_chapter == False):
                    to_open = "http://www.mangareader.net/" + self.manga_name + "/" + str(int(self.manga_nownumber) + 1)
                    response = self.br.open(to_open).read()
                    if "is not released yet" in response or "not published yet" in response or response == "":
                        last_chapter = True
                        if self.manga_name + ":" + str(self.manga_nownumber) not in open(configuration.DATA_FILE, "r").read():
                            Thread(target=self.exec_cmd).start()
                            configuration.backup()
                            open(configuration.DATA_FILE, 'w').write(
                                open(configuration.DATA_FILE + ".bak", "r").read().replace(
                                    self.manga_name + ":" + str(self.manga_oldnumber) + ":" + self.manga_olddate,
                                    self.manga_name + ":" + str(self.manga_nownumber) + ":" + self.nowdate))
                    else:
                        print "not last chapter"
                        self.manga_nownumber = str(int(self.manga_nownumber) + 1)
            except Exception, e:
                print e
                print "manga run"
                if "is not released yet. If you liked" in response:
                    if self.manga_name + ":" + str(self.manga_nownumber) not in open(configuration.DATA_FILE, "r").read():
                        configuration.backup()
                        open(configuration.DATA_FILE, 'w').write(
                            open(configuration.DATA_FILE + ".bak", "r").read().replace(
                                self.manga_name + ":" + str(self.manga_oldnumber) + ":" + self.manga_olddate,
                                self.manga_name + ":" + str(self.manga_nownumber) + ":" + self.nowdate))
                pass

def test_basic_startup():
    import thread  # XXX: how to do this without threads?
    httpd = server.HTTPServer(('127.0.0.1', 21210), Handler)
    thread.start_new_thread(httpd.serve_forever, ())
    assert URLopener().open("http://127.0.0.1:21210/index").read() == "xxx"
    assert URLopener().open("http://127.0.0.1:21210/").read() == "xxx"

def handle_starttag(self, tag, attrs):
    #tmpoutput = ""
    count = 0
    global bDoWork
    #self.output = ""
    # Only parse the 'anchor' tag.
    if tag == "a":
        # Check the list of defined attributes.
        for name, value in attrs:
            # If href is defined, check it.
            if name == "href":
                if value[len(value) - 3:len(value)] == "jpg":
                    #print value
                    if not "http://" in value and bDoWork == True:
                        bDoWork = False
                        tmpoutput = value
                        #print "Val: " + value
                        imgurl = 'http://apod.nasa.gov/apod/' + tmpoutput
                        #print "IMGURL: " + imgurl
                        filename = imgurl.split('/')[-1]
                        #print "FileName: " + filename
                        if (not os.path.isfile(filename)) and ('apod.nasa.gov' in imgurl):
                            print "Downloading: " + filename
                            image = URLopener()
                            image.retrieve(imgurl, filename)
                            sleep(lWaitTime)
                        elif (os.path.isfile(filename)):
                            print "Verified: " + filename
                break

class check_the_mangas():
    def __init__(self, manga_name, db_conn):
        self.db_conn = db_conn
        self.manga_name = manga_name
        self.manga_oldnumber = sqlite_manager.get_manga_chapter(db_conn, manga_name)
        self.manga_nownumber = self.manga_oldnumber
        self.manga_olddate = sqlite_manager.get_manga_date(db_conn, manga_name)
        self.nowdate = self.today_date()
        self.br = URLopener()

    def today_date(self):
        return subprocess.check_output(["date", "+%a-%b-%e"]).replace("\n", "")

    # return 1 if the connection is working
    def test_connection(self):
        try:
            response = self.br.open(configuration.WEBSITE_TO_CHECK_CONNECTION).read()
            if configuration.KEYWORD in response:
                return 1
            else:
                return 0
        except:
            print "manga connection"
            return 0

    def exec_cmd(self):
        pid = os.fork()
        os.umask(0)
        os.system(configuration.MANGA_NEW_CMD.replace("MANGA", self.manga_name))

    def run(self):
        if (self.test_connection()):
            last_chapter = False
            try:
                while (last_chapter == False):
                    to_open = "http://www.mangareader.net/" + self.manga_name + "/" + str(int(self.manga_nownumber) + 1)
                    response = self.br.open(to_open).read()
                    if "is not released yet" in response or "not published yet" in response or response == "":
                        last_chapter = True
                        if self.manga_nownumber != sqlite_manager.get_manga_chapter(self.db_conn, self.manga_name):
                            print self.manga_name + ":" + self.manga_nownumber + ":" + self.nowdate
                            sqlite_manager.update_manga(self.db_conn, self.manga_name,
                                                        self.manga_nownumber, self.nowdate)
                    else:
                        self.manga_nownumber = str(int(self.manga_nownumber) + 1)
            except Exception, e:
                if "is not released yet. If you liked" in response:
                    if self.manga_nownumber != sqlite_manager.get_manga_chapter(self.db_conn, self.manga_name):
                        print self.manga_name + ":" + self.manga_nownumber + ":" + self.nowdate
                        sqlite_manager.update_manga(self.db_conn, self.manga_name,
                                                    self.manga_nownumber, self.nowdate)
                pass

def download_package(pkg_name, pkg_version):
    '''Download the required package. Sometimes the download can be flaky,
    so we use the retry decorator.'''
    pkg_type = 'sdist'  # Don't download wheel archives for now
    # This JSON endpoint is not provided by PyPI mirrors so we always need
    # to get this from pypi.python.org.
    pkg_info = json.loads(
        urlopen('https://pypi.python.org/pypi/%s/json' % pkg_name).read())

    downloader = URLopener()
    for pkg in pkg_info['releases'][pkg_version]:
        if pkg['packagetype'] == pkg_type:
            filename = pkg['filename']
            expected_md5 = pkg['md5_digest']
            if os.path.isfile(filename) and check_md5sum(filename, expected_md5):
                print "File with matching md5sum already exists, skipping %s" % filename
                return True
            pkg_url = "{0}/packages/{1}".format(PYPI_MIRROR, pkg['path'])
            print "Downloading %s from %s" % (filename, pkg_url)
            downloader.retrieve(pkg_url, filename)
            actual_md5 = md5(open(filename).read()).hexdigest()
            if check_md5sum(filename, expected_md5):
                return True
            else:
                print "MD5 mismatch in file %s." % filename
                return False
    print "Could not find archive to download for %s %s %s" % (
        pkg_name, pkg_version, pkg_type)
    sys.exit(1)

def do_method(self, method):
    method_conf = self.config[method]
    matchlen = 0
    match = None
    for path in method_conf:
        if self.is_path_prefix(path) and len(path) > matchlen:
            matchlen = len(path)
            match = path
    if matchlen > 0:
        self.send_error(method_conf[match])
    elif "forward_to" in self.config:
        url = urljoin(self.config['forward_to'], self.path)
        self.log_request()
        self.log_message("Forwarding to {}".format(url))
        o = URLopener().open(url)
        self.wfile.write(o.read())
        o.close()
    elif "*" in method_conf:
        self.send_error(method_conf['*'])
    else:
        print(method.upper(), self.path, self.config['port'])
        self.log_message(
            "No match for %s %s on port %d and no default configured" %
            (method.upper(), self.path, self.config['port']))
        self.send_error(404)

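A sketch of the config shape this handler appears to expect, inferred from the code above and not taken from the original: per-method maps of path prefixes to HTTP error codes, an optional '*' default, and optional forward_to/port keys.

example_config = {
    'get': {'/admin': 403, '/legacy': 410},  # longest matching path prefix wins
    'post': {'*': 405},                      # '*' is the per-method default
    'forward_to': 'http://127.0.0.1:8080',   # optional upstream for unmatched paths
    'port': 8000,
}
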
def command(self):
    args = list(self.args)
    method, url = args[0:2]

    if not url.startswith("http"):
        url = "http://%s:%s%s" % (
            self.session.config.sys.http_host,
            self.session.config.sys.http_port,
            ("/" + url).replace("//", "/"),
        )

    # FIXME: The python URLopener doesn't seem to support other verbs,
    #        which is really quite lame.
    method = method.upper()
    assert method in ("GET", "POST")

    qv, pv = [], []
    if method == "POST":
        which = pv
    else:
        which = qv
    for arg in args[2:]:
        if "=" in arg:
            which.append(tuple(arg.split("=", 1)))
        elif arg.upper()[0] == "P":
            which = pv
        elif arg.upper()[0] == "Q":
            which = qv

    if qv:
        qv = urlencode(qv)
        url += ("?" in url and "&" or "?") + qv

    # Log us in automagically!
    httpd = self.session.config.http_worker.httpd
    global HACKS_SESSION_ID
    if HACKS_SESSION_ID is None:
        HACKS_SESSION_ID = httpd.make_session_id(None)
    mailpile.auth.SetLoggedIn(None, user="******",
                              session_id=HACKS_SESSION_ID)
    cookie = httpd.session_cookie

    try:
        uo = URLopener()
        uo.addheader("Cookie", "%s=%s" % (cookie, HACKS_SESSION_ID))

        with TcpConnBroker().context(need=[TcpConnBroker.OUTGOING_HTTP],
                                     oneshot=True):
            if method == "POST":
                (fn, hdrs) = uo.retrieve(url, data=urlencode(pv))
            else:
                (fn, hdrs) = uo.retrieve(url)

        hdrs = unicode(hdrs)
        data = open(fn, "rb").read().strip()
        if data.startswith("{") and "application/json" in hdrs:
            data = json.loads(data)

        return self._success("%s %s" % (method, url),
                             result={"headers": hdrs.splitlines(),
                                     "data": data})
    except:
        self._ignore_exception()
        return self._error("%s %s" % (method, url))

def __init__(self, source, proxy = ""): self.source = source if len(proxy) > 0: self._opener = URLopener({"http": proxy}) else: self._opener = URLopener() self._fetchQueue = Queue(0) self._fetchThread = Thread(target = self._FetchTile) self._fetchThread.setDaemon(True) self._fetchThread.start()
def connection():
    try:
        br = URLopener()
        response = br.open(configuration.WEBSITE_TO_CHECK_CONNECTION).read()
        if configuration.KEYWORD in response:
            return 1
        else:
            return 0
    except:
        return 0

def read(self, uri=None, resources=None, index_only=False):
    """Read sitemap from a URI including handling sitemapindexes

    If index_only is True then individual sitemaps referenced in a
    sitemapindex will not be read. This will result in no resources
    being returned and is useful only to read the metadata and links
    listed in the sitemapindex.

    Includes the subtlety that if the input URI is a local file and is a
    sitemapindex which contains URIs for the individual sitemaps, then
    these are mapped to the filesystem also.
    """
    try:
        fh = URLopener().open(uri)
        self.num_files += 1
    except IOError as e:
        raise IOError("Failed to load sitemap/sitemapindex from %s (%s)" %
                      (uri, str(e)))
    # Get the Content-Length if we can (works fine for local files)
    try:
        self.content_length = int(fh.info()['Content-Length'])
        self.bytes_read += self.content_length
        self.logger.debug("Read %d bytes from %s" % (self.content_length, uri))
    except KeyError:
        # If we don't get a length then c'est la vie
        self.logger.debug("Read ????? bytes from %s" % (uri))
    self.logger.info("Read sitemap/sitemapindex from %s" % (uri))
    s = self.new_sitemap()
    s.parse_xml(fh=fh, resources=self, capability=self.capability_name)
    # what did we read? sitemap or sitemapindex?
    if (s.parsed_index):
        # sitemapindex
        if (not self.allow_multifile):
            raise ListBaseIndexError(
                "Got sitemapindex from %s but support for sitemapindex disabled" % (uri))
        self.logger.info("Parsed as sitemapindex, %d sitemaps" %
                         (len(self.resources)))
        sitemapindex_is_file = self.is_file_uri(uri)
        if (index_only):
            # don't read the component sitemaps
            self.sitemapindex = True
            return
        # now loop over all entries to read each sitemap and add to resources
        sitemaps = self.resources
        self.resources = self.resources_class()
        self.logger.info("Now reading %d sitemaps" % len(sitemaps.uris()))
        for sitemap_uri in sorted(sitemaps.uris()):
            self.read_component_sitemap(uri, sitemap_uri, s,
                                        sitemapindex_is_file)
    else:
        # sitemap
        self.logger.info("Parsed as sitemap, %d resources" %
                         (len(self.resources)))

def utGrabFromUrl(p_url):
    """ Takes a file from a remote server """
    from urllib import URLopener
    try:
        l_opener = URLopener()
        l_file = l_opener.open(p_url)
        ctype = l_file.headers['Content-Type']
        l_opener.close()
        return (l_file.read(), ctype)
    except:
        return (None, 'text/x-unknown-content-type')

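A hedged usage sketch; the URL is a placeholder. On any failure the function deliberately degrades to (None, 'text/x-unknown-content-type') rather than raising.

content, ctype = utGrabFromUrl('http://example.org/logo.png')
if content is None:
    print 'download failed, fallback type:', ctype
else:
    print 'fetched %d bytes of %s' % (len(content), ctype)
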
class SlippyCache(object):
    """This is a basic map tile cache used by the SlippyPanel class
    to retrieve and store locally the images that form the map"""

    def __init__(self, source, proxy=""):
        self.source = source
        if len(proxy) > 0:
            self._opener = URLopener({"http": proxy})
        else:
            self._opener = URLopener()
        self._fetchQueue = Queue(0)
        self._fetchThread = Thread(target=self._FetchTile)
        self._fetchThread.setDaemon(True)
        self._fetchThread.start()

    def _FetchTile(self):
        task = ""
        while task is not None:
            task = self._fetchQueue.get()
            url, fname = task
            if not os.path.isfile(fname):
                print "Getting", fname
                try:
                    self._opener.retrieve(url, "tmp.png")
                    shutil.move("tmp.png", fname)
                except IOError:
                    pass
            self._fetchQueue.task_done()

    def StartNewFetchBatch(self):
        try:
            while True:
                item = self._fetchQueue.get(False)
                self._fetchQueue.task_done()
        except Empty:
            pass

    def GetTileFilename(self, xtile, ytile, zoom):
        numTiles = 2 ** zoom
        while xtile >= numTiles:
            xtile -= numTiles
        if xtile < 0 or ytile < 0 or ytile >= numTiles:
            # Indicate that this is not a valid tile
            return None
        else:
            fname = "/".join([self.source.get_full_name(),
                              str(zoom), str(xtile), str(ytile) + ".png"])
            if not os.path.isfile(fname):
                url = self.source.get_tile_url(xtile, ytile, zoom)
                # Ensure that the directory exists
                dname = os.path.dirname(fname)
                if not os.path.isdir(dname):
                    os.makedirs(dname)
                self._fetchQueue.put((url, fname))
            # Valid tile, though may not be present in the cache
            return fname

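A hedged usage sketch for the cache above. The tile source API (get_full_name, get_tile_url) is inferred from the calls in GetTileFilename; the stub below and its URL are illustrative only.

class StubSource(object):
    def get_full_name(self):
        return "stub-tiles"

    def get_tile_url(self, xtile, ytile, zoom):
        return "http://tile.example.org/%d/%d/%d.png" % (zoom, xtile, ytile)

# cache = SlippyCache(StubSource(), proxy="")
# fname = cache.GetTileFilename(532, 346, 10)  # queues a fetch, returns the cache path
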
def read(self, uri=None, inventory=None):
    """Read sitemap from a URI including handling sitemapindexes

    Returns the inventory.

    Includes the subtlety that if the input URI is a local file and the
    """
    if (inventory is None):
        inventory = Inventory()
    try:
        fh = URLopener().open(uri)
    except IOError as e:
        raise Exception(
            "Failed to load sitemap/sitemapindex from %s (%s)" % (uri, str(e)))
    etree = parse(fh)
    # check root element: urlset (for sitemap), sitemapindex or bad
    self.sitemaps_created = 0
    if (etree.getroot().tag == '{' + SITEMAP_NS + "}urlset"):
        self.inventory_parse_xml(etree=etree, inventory=inventory)
        self.sitemaps_created += 1
    elif (etree.getroot().tag == '{' + SITEMAP_NS + "}sitemapindex"):
        if (not self.allow_multifile):
            raise Exception(
                "Got sitemapindex from %s but support for sitemapindex disabled" % (uri))
        sitemaps = self.sitemapindex_parse_xml(etree=etree)
        sitemapindex_is_file = self.is_file_uri(uri)
        # now loop over all entries to read each sitemap and add to inventory
        for sitemap_uri in sorted(sitemaps.resources.keys()):
            if (sitemapindex_is_file):
                if (not self.is_file_uri(sitemap_uri)):
                    # Attempt to map URI to local file
                    remote_uri = sitemap_uri
                    sitemap_uri = self.mapper.src_to_dst(remote_uri)
            else:
                # FIXME - need checks on sitemap_uri values:
                # 1. should be in same server/path as sitemapindex URI
                pass
            try:
                fh = URLopener().open(sitemap_uri)
            except IOError as e:
                raise Exception(
                    "Failed to load sitemap from %s listed in sitemap index %s (%s)" %
                    (sitemap_uri, uri, str(e)))
            self.inventory_parse_xml(fh=fh, inventory=inventory)
            self.sitemaps_created += 1
            #print "%s : now have %d resources" % (sitemap_uri, len(inventory.resources))
    else:
        raise ValueError("XML is not sitemap or sitemapindex")
    return (inventory)

def test_05_03_http_image_zipfile(self):
    # Make a zipfile using files accessed from the web
    def alter_fn(module):
        self.assertTrue(isinstance(module, C.CreateWebPage))
        module.wants_zip_file.value = True
        module.zipfile_name.value = ZIPFILE_NAME
        module.directory_choice.dir_choice = C.ABSOLUTE_FOLDER_NAME
        module.directory_choice.custom_path = cpprefs.get_default_image_directory()

    url_root = "http://cellprofiler.org/svnmirror/ExampleImages/ExampleSBSImages/"
    url_query = "?r=11710"
    filenames = [(url_root, fn + url_query) for fn in
                 ("Channel1-01-A-01.tif", "Channel2-01-A-01.tif",
                  "Channel1-02-A-02.tif", "Channel2-02-A-02.tif")]
    #
    # Make sure URLs are accessible
    #
    try:
        for filename in filenames:
            URLopener().open("".join(filename)).close()
    except IOError, e:
        def bad_url(e=e):
            raise e
        unittest.expectedFailure(bad_url)()

def download_package(pkg_name, pkg_version):
    file_name, path, expected_md5 = get_package_info(pkg_name, pkg_version)
    if not file_name:
        return False
    if os.path.isfile(file_name) and check_md5sum(file_name, expected_md5):
        print 'File with matching md5sum already exists, skipping {0}'.format(file_name)
        return True

    downloader = URLopener()
    pkg_url = '{0}/packages/{1}'.format(PYPI_MIRROR, path)
    print 'Downloading {0} from {1}'.format(file_name, pkg_url)
    downloader.retrieve(pkg_url, file_name)
    if check_md5sum(file_name, expected_md5):
        return True
    else:
        print 'MD5 mismatch in file {0}.'.format(file_name)
        return False

def download(self, sysctl, code):
    try:
        logging.info('Begin download files.')
        if not isdir(self.p_dwld):
            mkdir(self.p_dwld)
        obj = URLopener()
        for f in self.files:
            logging.info('Start download {}.'.format(f))
            obj.retrieve(self.url + f, self.p_dwld + f)
            logging.info('Download {} done.'.format(f))
        return True
    except BaseException as down:
        logging.error('Download {}.'.format(down))
        self._rolback(sysctl, code)

def test_static_directory():
    py.test.skip("Fails")
    import thread
    tmpdir = py.test.ensuretemp("server_static_dir")
    tmpdir.ensure("a", dir=1)
    tmpdir.join("a").ensure("a.txt").write("aaa")
    tmpdir.join("a").ensure("b.txt").write("bbb")

    class StaticDir(server.Handler):
        static_dir = tmpdir
        a_dir = server.StaticDir(tmpdir.join("a"))

    httpd = server.HTTPServer(('127.0.0.1', 0), StaticDir)
    port = httpd.server_port
    thread.start_new_thread(httpd.serve_forever, ())
    addr = "http://127.0.0.1:%d/" % port
    assert URLopener().open(addr + "a_dir/a.txt").read() == "aaa"
    assert URLopener().open(addr + "a_dir/b.txt").read() == "bbb"

def startplayback_images(args):
    """Shows an image"""
    # cache path
    sDir = xbmc.translatePath(args._addon.getAddonInfo("profile"))
    if args.PY2:
        sPath = join(sDir.decode("utf-8"), u"image.jpg")
    else:
        sPath = join(sDir, "image.jpg")

    # download image
    file = URLopener()
    file.retrieve(args.url, sPath)

    # display image
    item = xbmcgui.ListItem(getattr(args, "title", "Title not provided"),
                            path=sPath)
    xbmcplugin.setResolvedUrl(int(args._argv[1]), True, item)
    xbmc.executebuiltin("SlideShow(" + sDir + ")")

def download_package(pkg_name, pkg_version):
    file_name, path, hash_algorithm, expected_digest = get_package_info(
        pkg_name, pkg_version)
    if not file_name:
        return False
    if os.path.isfile(file_name) and check_digest(file_name, hash_algorithm,
                                                  expected_digest):
        print 'File with matching digest already exists, skipping {0}'.format(file_name)
        return True

    downloader = URLopener()
    pkg_url = '{0}/packages/{1}'.format(PYPI_MIRROR, path)
    print 'Downloading {0} from {1}'.format(file_name, pkg_url)
    downloader.retrieve(pkg_url, file_name)
    if check_digest(file_name, hash_algorithm, expected_digest):
        return True
    else:
        print 'Hash digest check failed in file {0}.'.format(file_name)
        return False

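A hedged usage sketch; the package name and version are illustrative, and get_package_info/check_digest/PYPI_MIRROR come from the surrounding module.

# if not download_package('six', '1.10.0'):
#     sys.exit(1)
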
def open_http(url, data=None):
    """Use HTTP protocol."""
    user_passwd = None
    proxy_passwd = None
    if isinstance(url, str):
        host, selector = splithost(url)
        if host:
            user_passwd, host = splituser(host)
            host = urllib.unquote(host)
        realhost = host
    else:
        host, selector = url
        # check whether the proxy contains authorization information
        proxy_passwd, host = splituser(host)
        # now we proceed with the url we want to obtain
        urltype, rest = urllib.splittype(selector)
        url = rest
        user_passwd = None
        if urltype.lower() != 'http':
            realhost = None
        else:
            realhost, rest = splithost(rest)
            if realhost:
                user_passwd, realhost = splituser(realhost)
            if user_passwd:
                selector = "%s://%s%s" % (urltype, realhost, rest)
            if urllib.proxy_bypass(realhost):
                host = realhost
        #print "proxy via http:", host, selector
    if not host:
        raise IOError('http error', 'no host given')
    if proxy_passwd:
        import base64
        proxy_auth = base64.b64encode(proxy_passwd).strip()
    else:
        proxy_auth = None
    if user_passwd:
        import base64
        auth = base64.b64encode(user_passwd).strip()
    else:
        auth = None
    c = FakeHTTPConnection(host)
    if data is not None:
        c.putrequest('POST', selector)
        c.putheader('Content-Type', 'application/x-www-form-urlencoded')
        c.putheader('Content-Length', '%d' % len(data))
    else:
        c.putrequest('GET', selector)
    if proxy_auth:
        c.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth)
    if auth:
        c.putheader('Authorization', 'Basic %s' % auth)
    if realhost:
        c.putheader('Host', realhost)
    for args in URLopener().addheaders:
        c.putheader(*args)
    c.endheaders()
    return c

def download_reports(years=_years, weeks=_weeks):
    '''Crawls through the IMoH website and downloads all excel files in the
    given weeks and years'''
    # Create paths for logging files and download location
    prefix = datetime.now().strftime('./log/weeklies/%y%m%d_%H%M%S_')
    log_d = prefix + "downloads.log"
    log_f = prefix + "FAILED.log"
    base_loc = 'http://www.health.gov.il/PublicationsFiles/IWER'
    # URL object
    my_file = URLopener()
    for year in years:
        print "\n", year,
        for week in weeks:
            f = open(log_d, 'a')
            f.write('\n{year}_{week}: '.format(week=week, year=year))
            # There are many different options of paths
            options = ['{base}{week:02d}_{year}.xls'.format(base=base_loc, week=week, year=year),
                       '{base}{week}_{year}.xls'.format(base=base_loc, week=week, year=year),
                       '{base}{week:02d}_{year}.xlsx'.format(base=base_loc, week=week, year=year),
                       '{base}{week}_{year}.xlsx'.format(base=base_loc, week=week, year=year)]
            for i, o in enumerate(options):
                filetype = o.split(".")[-1]
                try:
                    # Try different paths on remote, but always save on the
                    # same path locally
                    my_file.retrieve(o, './data/weeklies/{year}_{week:02d}.{ft}'.format(
                        week=week, year=year, ft=filetype))
                    # If retrieval succeeds, log which filetype (xls/x) was
                    # saved, close the log file and break the loop
                    f.write('{ft}'.format(ft=filetype))
                    f.close()
                    break
                except:
                    # When an option fails, write the try number to the log
                    f.write("{} ".format(i + 1))
                    # If all options were exhausted, the download has failed
                    if i == len(options) - 1 and week != 53:
                        print "== {year}_{week:02d} FAILED ==".format(week=week, year=year),
                        with open(log_f, 'a') as failed:
                            failed.write("{year}_{week:02d} FAILED\n".format(week=week, year=year))
                        f.write("FAILED")
                        f.close()
            f.close()

def call_remote(self, category, params):
    '''
    The meetup api is set up such that the root url does not change much
    other than the 'name' of the thing you call into. In other words, I can
    just use category to sprintf my way to a valid url, then tack on the
    rest of the query string specified in params.
    '''
    url = self.root_url
    url = url % (category)
    # Every call has to include key
    url = url + "?" + params + "&key=" + self.key
    client = URLopener()
    request = client.open(url)
    raw_str = request.read()
    results = json.loads(raw_str)
    # Let the caller interpret the results of the call. Both the
    # meta info and the results are passed back
    return results

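A hedged usage sketch; '2/events' and the query string are illustrative Meetup API values, and root_url is assumed to be a format string such as 'https://api.meetup.com/%s'.

# results = client.call_remote('2/events', 'group_urlname=example-group')
# for event in results.get('results', []):
#     print event['name']
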
class Updater:
    """
    takes a server location and an info file as parameters in the
    constructor it will use this server to fetch the new information
    there should be a /hash and /info.json dir on this server
    """

    def __init__(self, server, infoFile):
        self._server = server
        self._infoFile = infoFile
        self.br = URLopener()

    def hasNewInfo(self):
        """
        hasNewInfo :: Boolean
        compare the local info file hash with the one found on the server
        and returns true if they are different
        """
        f = open(self._infoFile, 'r').read()
        m = md5.new(f).hexdigest()
        response = self.br.open(self._server + '/hash').read()
        response = response.replace("\n", "")
        return (m != response)

    def generateTimeStamp(self):
        """
        generateTimeStamp :: String
        returns a string that is used to timestamp old config backup files
        """
        return str(time.gmtime().tm_year) + "_" + str(time.gmtime().tm_mday) + "_" + \
            str(time.gmtime().tm_hour) + "_" + str(time.gmtime().tm_min)

    def fetchNewInfo(self):
        """
        fetchNewInfo :: Void
        it will download the info file from the server
        use the timestamp to back it up and overwrite it
        """
        response = self.br.open(self._server + '/info.json').read()
        oldInfo = open(self._infoFile, 'r').read()
        open(self._infoFile + "." + self.generateTimeStamp(), 'w').write(oldInfo)
        open(self._infoFile, 'w').write(response)

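A hedged usage sketch; the server URL and info file path are placeholders. hasNewInfo compares MD5 digests, so any difference, newer or older, triggers a fetch.

# updater = Updater('http://updates.example.org', '/etc/myapp/info.json')
# if updater.hasNewInfo():
#     updater.fetchNewInfo()  # old file is kept as info.json.<timestamp>
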
def get(self, url, inventory=None):
    """Get an inventory from url

    Will either create a new Inventory object or add to one supplied.
    """
    # Either use inventory passed in or make a new one
    if (inventory is None):
        inventory = Inventory()

    inventory_fh = URLopener().open(url)
    Sitemap().inventory_parse_xml(fh=inventory_fh, inventory=inventory)
    return (inventory)

def download_if_not_exist(url, target_file):
    if not os.path.isfile(target_file):
        get_logger().info('downloading %s to %s', url, target_file)
        makedirs(os.path.dirname(target_file), exists_ok=True)
        temp_filename = target_file + '.part'
        if os.path.isfile(temp_filename):
            os.remove(temp_filename)
        URLopener().retrieve(url, temp_filename)
        os.rename(temp_filename, target_file)
    return target_file

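A hedged usage sketch; the URL and target path are placeholders. Downloading to a '.part' file and renaming at the end means an interrupted transfer never leaves a truncated file at the final path.

# download_if_not_exist('http://example.org/model.bin', '/tmp/models/model.bin')
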
def __init__(self, manga_name, chapter, end_chapter, manga_location, dl_manager):
    self.manga_location = manga_location
    self.manga_name = manga_name
    self.chapter = chapter
    self.end_chapter = end_chapter
    self.current_image = "000"
    self.img = ""
    self.imgs = []
    self.chapters = []
    self.br = URLopener()
    self.response = ""
    self.response_lines = ""
    self.dl_manager = dl_manager

def __init__(self):
    global dbaselocal
    global datapath
    fname = datapath + 'TRMM_classmap.dat'
    print 'Loading class map ', fname
    if dbaselocal:
        landclassmap.data = np.loadtxt(fname, dtype='int')[:, 1]
    else:
        f = URLopener().open(fname)
        tmp = []
        for line in f:
            columns = line.split()
            tmp.append(int(columns[1]))
        f.close()
        landclassmap.data = np.array(tmp)
    landclassmap.data = np.reshape(landclassmap.data, (-1, 360))
    print 'Class map loaded'

def test_static_page():
    import thread
    tmpdir = py.test.ensuretemp("server_static_page")
    tmpdir.ensure("test.html").write("<html></html>")

    class StaticHandler(server.TestHandler):
        static_dir = str(tmpdir)
        index = server.Static(os.path.join(static_dir, "test.html"))

    httpd = server.HTTPServer(('127.0.0.1', 21212), StaticHandler)
    thread.start_new_thread(httpd.serve_forever, ())
    assert URLopener().open("http://127.0.0.1:21212/index").read() == \
        "<html></html>"

def test_static_page_implicit():
    import thread
    tmpdir = py.test.ensuretemp("server_static_page_implicit")
    tmpdir.ensure("index.html").write("<html></html>")

    class StaticHandler(server.TestHandler):
        static_dir = str(tmpdir)
        index = server.Static()

    server.patch_handler(StaticHandler)
    httpd = server.HTTPServer(('127.0.0.1', 21213), StaticHandler)
    thread.start_new_thread(httpd.serve_forever, ())
    assert URLopener().open("http://127.0.0.1:21213/index").read() == \
        "<html></html>"

class Updater:
    def __init__(self, server, infoFile):
        """
        takes a server location and an info file as parameters in the
        constructor it will use this server to fetch the new information
        there should be a json/version and json/info.json dir on this server
        """
        self._infoFile = infoFile
        self._serverJSON = server + self._infoFile
        self._serverDate = server + "json/version"
        if sys.version < '3':
            self.br = URLopener()
        else:
            self.br = request

    def hasNewInfo(self):
        """
        hasNewInfo :: Boolean
        compare the local version tag with the one found on the server
        and returns true if the server version is newer
        """
        jsonDate = open(location_manager.VERSION, 'r').read().strip()
        if sys.version < '3':
            servDate = self.br.open(self._serverDate).read().strip()
        else:
            servDate = self.br.urlopen(self._serverDate).read().strip()
        return (int(jsonDate) < int(servDate))

    def generateTimeStamp(self):
        """
        generateTimeStamp :: String
        returns a string that is used to timestamp old config backup files
        """
        return open(location_manager.VERSION, 'r').read().strip()

    def fetchNewInfo(self):
        """
        fetchNewInfo :: Void
        it will download the info file from the server
        use the timestamp to back it up and overwrite it
        """
        # Fetching server's info.json
        if sys.version < '3':
            response = self.br.open(self._serverJSON).read()
        else:
            response = self.br.urlopen(self._serverJSON).read().decode("utf-8")
        oldInfo = open(self._infoFile, 'r').read()
        open(self._infoFile + "." + self.generateTimeStamp(), 'w').write(oldInfo)
        open(self._infoFile, 'w').write(response)
        # Fetching server's version
        if sys.version < '3':
            servDate = int(self.br.open(self._serverDate).read().strip())
        else:
            servDate = int(self.br.urlopen(self._serverDate).read().strip())
        open(location_manager.VERSION, 'w').write(str(servDate))

def download__grobid_service_zip_if_not_exist(self):
    if not os.path.isfile(self.grobid_service_zip_filename):
        get_logger().info(
            'downloading %s to %s',
            self.grobid_service_zip_url,
            self.grobid_service_zip_filename
        )
        makedirs(os.path.dirname(self.grobid_service_zip_filename), exists_ok=True)
        temp_zip_filename = self.grobid_service_zip_filename + '.part'
        if os.path.isfile(temp_zip_filename):
            os.remove(temp_zip_filename)
        URLopener().retrieve(self.grobid_service_zip_url, temp_zip_filename)
        os.rename(temp_zip_filename, self.grobid_service_zip_filename)

def maybe_download_tesst_image(file_name):
    '''Download the given TestImages file if not in the directory

    file_name - name of file to fetch

    Image will be downloaded if not present to CP_EXAMPLEIMAGES directory.
    '''
    local_path = os.path.join(testimages_directory(), file_name)
    if not os.path.exists(local_path):
        url = testimages_url() + "/" + file_name
        try:
            URLopener().retrieve(url, local_path)
        except IOError, e:
            # This raises the "expected failure" exception.
            def bad_url(e=e):
                raise e
            unittest.expectedFailure(bad_url)()

def parse(self, uri=None, fh=None, str_data=None):
    """Parse a single XML document for this list

    Accepts either a uri (uri or default if parameter not specified),
    or a filehandle (fh) or a string (str_data). The string parameter
    is renamed from 'str' so it no longer shadows the builtin used in
    the error message below.

    Does not handle the case of sitemapindex+sitemaps
    """
    if (uri is not None):
        try:
            fh = URLopener().open(uri)
        except IOError as e:
            raise Exception(
                "Failed to load sitemap/sitemapindex from %s (%s)" % (uri, str(e)))
    elif (str_data is not None):
        fh = StringIO.StringIO(str_data)
    if (fh is None):
        raise Exception("Nothing to parse")
    s = self.new_sitemap()
    s.parse_xml(fh=fh, resources=self, capability=self.capability_name,
                sitemapindex=False)
    self.parsed_index = s.parsed_index

def try_download(_path, _file, _url, _stale):
    now = time()
    url = URLopener()
    file_exists = isfile(_path + _file) == True
    if file_exists:
        file_old = (getmtime(_path + _file) + _stale) < now
    if not file_exists or (file_exists and file_old):
        try:
            url.retrieve(_url, _path + _file)
            result = 'ID ALIAS MAPPER: \'{}\' successfully downloaded'.format(_file)
        except IOError:
            result = 'ID ALIAS MAPPER: \'{}\' could not be downloaded'.format(_file)
    else:
        result = 'ID ALIAS MAPPER: \'{}\' is current, not downloaded'.format(_file)
    url.close()
    return result

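A hedged usage sketch; the path, file name, URL and the seven-day staleness window are illustrative.

# print try_download('./data/', 'peer_ids.json',
#                    'http://example.org/peer_ids.json', 7 * 86400)
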
def open(self, *args):
    f = URLopener.open(self, *args)
    return XML(f)

def http_error_default(*a, **k):
    return URLopener.http_error_default(*a, **k)

def test_ping_play1():
    from urllib import URLopener
    u = URLopener()
    text = "<title>pypy.js various demos</title>"
    assert u.open("http://play1.pypy.org/").read().find(text) != -1

#!/usr/bin/env python
from re import sub
from BeautifulSoup import BeautifulSoup
from urllib import URLopener

opener = URLopener()
html = opener.open('http://www.dailyzen.com/').read()
html = html[html.index('<!--Add Quote for correct day-->'):]
html1 = html[:html.index('<br>')]
html2 = html[html.index('<A class="artist">'):]
html2 = html2[:html2.index('</a></i>')]
html2 = sub('<A class="artist">', '', html2).strip()

zen = BeautifulSoup(html1)
zen = zen.prettify().strip()
for x in ['<!--Add Quote for correct day-->', '<br />', '<p>', '</p>', '^\n', '\n$']:
    zen = sub(x, '', zen).strip()
zen = sub('\n \n \n', '\n \n', zen).strip()

print
print zen
print
print '\t\t', html2