def download_and_extract_from_fallback_url(fallback_url, filename, relative_src_dir, download_dir, extract_dir, sha1_hash_expected=None):
    p = urlparse(fallback_url)
    new_path = p[2] + "/" + relative_src_dir + "/" + filename
    fallback_download_url = urlunparse([p[0], p[1], new_path, p[3], p[4], p[5]])

    download_filename = eos.util.download_file(fallback_download_url, download_dir,
                                               sha1_hash_expected=sha1_hash_expected)
    if download_filename == "":
        eos.log_error("downloading of file from fallback URL " + fallback_download_url + " failed")
        return False

    if os.path.exists(extract_dir):
        shutil.rmtree(extract_dir)

    if not eos.archive.extract_file(download_filename, extract_dir):
        eos.log_error("extraction of file " + download_filename + " from fallback URL " +
                      fallback_download_url + " failed")
        return False

    return True
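# Hedged illustration (not part of the original project) of the URL composition used
# above: only the path of the parsed fallback URL is extended, all other components are
# preserved. The mirror host, directory, and filename below are hypothetical.
from urllib.parse import urlparse, urlunparse

p = urlparse("https://mirror.example.org/files")
new_path = p[2] + "/" + "src" + "/" + "lib.tar.gz"
print(urlunparse([p[0], p[1], new_path, p[3], p[4], p[5]]))
# -> https://mirror.example.org/files/src/lib.tar.gz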
def get_parser(self, lwa):
    # https://stackoverflow.com/questions/15799696/library-to-build-urls-in-python
    clwa = list(urlparse(lwa))
    for cid, cname in enumerate(self.parsers):
        if cname.__name__.lower() in clwa[1]:
            clwa[2] = ''
            return (cid, self.parsers[cid], urlunparse(clwa))
def query_joiner(text=None):
    components = OrderedDict([('language', 'en'), ('text', text)])
    query_items = list(components.items())
    query = '&'.join(['='.join(field) for field in query_items])
    terms = OrderedDict([('scheme', 'http'),
                         ('netloc', 'conceptnet5.media.mit.edu'),
                         ('path', '/data/5.4/uri'),
                         ('params', ''),
                         ('query', query),
                         ('fragment', '')])
    return urlunparse(terms.values())
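# Hedged usage sketch (not part of the original snippet); it assumes the imports the
# snippet relies on and the query_joiner() definition above. The search term is made up.
from collections import OrderedDict
from urllib.parse import urlunparse

print(query_joiner("dog"))
# -> http://conceptnet5.media.mit.edu/data/5.4/uri?language=en&text=dog
# Note: calling query_joiner() with the default text=None would raise a TypeError,
# because '='.join(('text', None)) cannot join a None value.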
def downloadFile(url, download_dir, target_dir_name, sha1_hash=None, force_download=False, user_agent=DEFAULT_USER_AGENT):
    if not os.path.isdir(download_dir):
        os.mkdir(download_dir)

    p = urlparse(url)
    url = urlunparse([p[0], p[1], quote(p[2]), p[3], p[4], p[5]])  # replace special characters in the URL path
    filename_rel = os.path.split(p.path)[1]  # get original filename
    target_filename = os.path.join(download_dir, filename_rel)

    # check SHA1 hash, if file already exists
    if os.path.exists(target_filename) and sha1_hash is not None and sha1_hash != "":
        hash_file = computeFileHash(target_filename)
        if hash_file != sha1_hash:
            log("Hash of " + target_filename + " (" + hash_file + ") does not match expected hash (" +
                sha1_hash + "); forcing download")
            force_download = True

    # download file
    if (not os.path.exists(target_filename)) or force_download:
        log("Downloading " + url + " to " + target_filename)
        if p.scheme == "ssh":
            downloadSCP(p.hostname, p.username, p.path, download_dir)
        else:
            if user_agent is not None:
                opener = urllib.request.build_opener()
                opener.addheaders = [('User-agent', user_agent)]
                f = open(target_filename, 'wb')
                f.write(opener.open(url).read())
                f.close()
            else:
                urlretrieve(url, target_filename)
    else:
        log("Skipping download of " + url + "; already downloaded")

    # check SHA1 hash
    if sha1_hash is not None and sha1_hash != "":
        hash_file = computeFileHash(target_filename)
        if hash_file != sha1_hash:
            raise RuntimeError("Hash of " + target_filename + " (" + hash_file + ") differs from expected hash (" +
                               sha1_hash + ")")

    return target_filename
def main():
    host = "www.google.co.kr"
    conn = http.client.HTTPConnection(host)
    conn.request("GET", '')
    resp = conn.getresponse()
    data = resp.read().decode(resp.headers.get_content_charset())
    conn.close()

    url = urlunparse(('http', host, '', '', '', ''))
    print(url)
    downloadImage(url, data)
def main():
    host = "www.google.com"
    conn = client.HTTPConnection(host)
    conn.request("GET", '')
    resp = conn.getresponse()
    charset = resp.headers.get_content_charset()
    data = resp.read().decode(charset)
    conn.close()

    print("\n>>>>>>>>> Download Images from", host)
    url = request.urlunparse(('http', host, '', '', '', ''))
    downloadImage(url, data)
def parse_images(url, page, out_folder):
    soup = bs(page, "html.parser")
    [x.extract() for x in soup.findAll('script')]
    parsed = list(urlparse(url))

    for image in soup.findAll("img"):
        print("Image: %(src)s" % image)
        filename = image["src"].split("/")[-1]
        parsed[2] = image["src"]
        outpath = os.path.join(out_folder, "images", filename)
        if image["src"].lower().startswith("http"):
            urlretrieve(image["src"], outpath)
        else:
            urlretrieve(urlunparse(parsed), outpath)
def main(url, out_folder="/test/"):
    """Downloads all the images at 'url' to /test/"""
    soup = bs(urlopen(url), features='html.parser')
    parsed = list(urlparse(url))

    for image in soup.findAll("img"):
        print("Image: %(src)s" % image)
        filename = image["src"].split("/")[-1]
        parsed[2] = image["src"]
        outpath = os.path.join(out_folder, filename)
        if image["src"].lower().startswith("http"):
            urlretrieve(image["src"], outpath)
        else:
            urlretrieve(urlunparse(parsed), outpath)
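# Hedged illustration (not from the original project) of how the relative-src branch
# above composes URLs: the page URL's path component is replaced wholesale by the img
# src, so the result is rooted at the host rather than joined against the page's
# directory. The page URL and image path are hypothetical.
from urllib.parse import urlparse, urlunparse

parsed = list(urlparse("http://example.com/gallery/index.html"))
parsed[2] = "img/pic.png"
print(urlunparse(parsed))
# -> http://example.com/img/pic.png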
def construct_url(query, max_position=None):
    """
    For a given query, will construct a URL to search Twitter with
    :param query: The query term used to search twitter
    :param max_position: The max_position value to select the next pagination of tweets
    :return: A string URL
    """
    params = {
        # Type Param
        'f': 'tweets',
        # Query Param
        'q': query
    }

    # If our max_position param is not None, we add it to the parameters
    if max_position is not None:
        params['max_position'] = max_position

    url_tuple = ('https', 'twitter.com', '/i/search/timeline', '', urlencode(params), '')
    return urlunparse(url_tuple)
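# Hedged usage sketch (not part of the original snippet); assumes the construct_url()
# definition above. The query and the pagination cursor value are made up.
from urllib.parse import urlencode, urlunparse

print(construct_url("python"))
# -> https://twitter.com/i/search/timeline?f=tweets&q=python
print(construct_url("python", max_position="TWEET-123-456"))
# -> https://twitter.com/i/search/timeline?f=tweets&q=python&max_position=TWEET-123-456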
def downloadFile(url, download_dir, target_dir_name, sha1_hash=None, force_download=False, user_agent=None):
    if not os.path.isdir(download_dir):
        os.mkdir(download_dir)

    p = urlparse(url)
    url = urlunparse([p[0], p[1], quote(p[2]), p[3], p[4], p[5]])  # replace special characters in the URL path
    filename_rel = os.path.split(p.path)[1]  # get original filename
    target_filename = os.path.join(download_dir, filename_rel)

    # check SHA1 hash, if file already exists
    if os.path.exists(target_filename) and sha1_hash is not None and sha1_hash != "":
        hash_file = computeFileHash(target_filename)
        if hash_file != sha1_hash:
            log("Hash of " + target_filename + " (" + hash_file + ") does not match expected hash (" +
                sha1_hash + "); forcing download")
            force_download = True

    # download file
    if (not os.path.exists(target_filename)) or force_download:
        log("Downloading " + url + " to " + target_filename)
        if p.scheme == "ssh":
            downloadSCP(p.hostname, p.username, p.path, download_dir)
        else:
            if user_agent is not None:
                MyURLOpener.version = user_agent
                MyURLOpener().retrieve(url, target_filename)
            else:
                urlretrieve(url, target_filename)
    else:
        log("Skipping download of " + url + "; already downloaded")

    # check SHA1 hash
    if sha1_hash is not None and sha1_hash != "":
        hash_file = computeFileHash(target_filename)
        if hash_file != sha1_hash:
            raise RuntimeError("Hash of " + target_filename + " (" + hash_file + ") differs from expected hash (" +
                               sha1_hash + ")")

    return target_filename
def sanitize_url(url):
    p = urlparse(url)
    url = urlunparse([p[0], p[1], quote(p[2]), p[3], p[4], p[5]])  # quote special characters in the path
    return url
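# Hedged usage sketch (not part of the original snippet); assumes the sanitize_url()
# definition above. The example URL is hypothetical.
from urllib.parse import urlparse, urlunparse, quote

print(sanitize_url("http://example.com/some file (v2).zip"))
# -> http://example.com/some%20file%20%28v2%29.zip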
def main(argv):
    global BASE_DIR, SRC_DIR, ARCHIVE_DIR, DEBUG_OUTPUT, FALLBACK_URL, USE_TAR, USE_UNZIP
    global TOOL_COMMAND_PYTHON, TOOL_COMMAND_GIT, TOOL_COMMAND_HG, TOOL_COMMAND_SVN, TOOL_COMMAND_PATCH, TOOL_COMMAND_TAR, TOOL_COMMAND_UNZIP

    try:
        opts, args = getopt.getopt(
            argv, "ln:N:cCb:h",
            ["list", "name=", "name-file=", "clean", "clean-all", "base-dir", "bootstrap-file=",
             "local-bootstrap-file=", "use-tar", "use-unzip", "repo-snapshots", "fallback-url=",
             "force-fallback", "debug-output", "help"])
    except getopt.GetoptError:
        printOptions()
        return 0

    opt_names = []
    name_files = []
    opt_clean = False
    opt_clean_archives = False
    list_libraries = False
    default_bootstrap_filename = "bootstrap.json"
    bootstrap_filename = os.path.abspath(os.path.join(BASE_DIR, default_bootstrap_filename))
    local_bootstrap_filename = ""
    create_repo_snapshots = False
    force_fallback = False
    base_dir_path = ""

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            printOptions()
            return 0
        if opt in ("-l", "--list"):
            list_libraries = True
        if opt in ("-n", "--name"):
            opt_names.append(arg)
        if opt in ("-N", "--name-file"):
            name_files.append(os.path.abspath(arg))
        if opt in ("-c", "--clean"):
            opt_clean = True
        if opt in ("-C", "--clean-all"):
            opt_clean = True
            opt_clean_archives = True
        if opt in ("-b", "--base-dir"):
            base_dir_path = os.path.abspath(arg)
            BASE_DIR = base_dir_path
            SRC_DIR = os.path.join(BASE_DIR, SRC_DIR_BASE)
            ARCHIVE_DIR = os.path.join(BASE_DIR, ARCHIVE_DIR_BASE)
            bootstrap_filename = os.path.join(BASE_DIR, default_bootstrap_filename)
            log("Using " + arg + " as base directory")
        if opt in ("--bootstrap-file",):
            bootstrap_filename = os.path.abspath(arg)
            log("Using main bootstrap file " + bootstrap_filename)
        if opt in ("--local-bootstrap-file",):
            local_bootstrap_filename = os.path.abspath(arg)
            log("Using local bootstrap file " + local_bootstrap_filename)
        if opt in ("--use-tar",):
            USE_TAR = True
        if opt in ("--use-unzip",):
            USE_UNZIP = True
        if opt in ("--repo-snapshots",):
            create_repo_snapshots = True
            log("Will create repository snapshots")
        if opt in ("--fallback-url",):
            FALLBACK_URL = arg
        if opt in ("--force-fallback",):
            force_fallback = True
            log("Using fallback URL to fetch all libraries")
        if opt in ("--debug-output",):
            DEBUG_OUTPUT = True

    if platform.system() != "Windows":
        # Unfortunately some IDEs do not have a proper PATH environment variable set,
        # so we search manually for the required tools in some obvious locations.
        paths_to_search = os.environ["PATH"].split(":") + ["/usr/local/bin", "/opt/local/bin", "/usr/bin"]
        TOOL_COMMAND_PYTHON = findToolCommand(TOOL_COMMAND_PYTHON, paths_to_search, required=True)
        TOOL_COMMAND_GIT = findToolCommand(TOOL_COMMAND_GIT, paths_to_search, required=True)
        TOOL_COMMAND_HG = findToolCommand(TOOL_COMMAND_HG, paths_to_search, required=True)
        TOOL_COMMAND_SVN = findToolCommand(TOOL_COMMAND_SVN, paths_to_search, required=True)
        TOOL_COMMAND_PATCH = findToolCommand(TOOL_COMMAND_PATCH, paths_to_search, required=True)
        TOOL_COMMAND_TAR = findToolCommand(TOOL_COMMAND_TAR, paths_to_search, required=USE_TAR)
        TOOL_COMMAND_UNZIP = findToolCommand(TOOL_COMMAND_UNZIP, paths_to_search, required=USE_UNZIP)

    if base_dir_path:
        os.chdir(base_dir_path)

    if name_files:
        for name_file in name_files:
            try:
                with open(name_file) as f:
                    opt_names_local = [l for l in (line.strip() for line in f) if l]
                    opt_names_local = [l for l in opt_names_local if l[0] != '#']
                    opt_names += opt_names_local
                    dlog("Name file contains: " + ", ".join(opt_names_local))
            except:
                log("ERROR: cannot parse name file " + name_file)
                return -1

    if force_fallback and not FALLBACK_URL:
        log("Error: cannot force usage of the fallback location without specifying a fallback URL")
        return -1

    state_filename = os.path.join(os.path.dirname(os.path.splitext(bootstrap_filename)[0]),
                                  "." + os.path.basename(os.path.splitext(bootstrap_filename)[0])) \
        + os.path.splitext(bootstrap_filename)[1]

    dlog("bootstrap_filename = " + bootstrap_filename)
    dlog("state_filename = " + state_filename)

    # read canonical libraries data
    data = readJSONData(bootstrap_filename)
    if data is None:
        return -1

    # some sanity checking
    for library in data:
        if library.get('name', None) is None:
            log("ERROR: Invalid schema: library object does not have a 'name'")
            return -1

    # read local libraries data, if available
    local_data = None
    if local_bootstrap_filename:
        local_data = readJSONData(local_bootstrap_filename)
        if local_data is None:
            return -1
        # some sanity checking
        for local_library in local_data:
            if local_library.get('name', None) is None:
                log("ERROR: Invalid schema: local library object does not have a 'name'")
                return -1

    # merge canonical and local library data, if applicable; local libraries take precedence
    if local_data is not None:
        for local_library in local_data:
            local_name = local_library.get('name', None)
            found_canonical_library = False
            for n, library in enumerate(data):
                name = library.get('name', None)
                if local_name == name:
                    data[n] = local_library  # overwrite library
                    found_canonical_library = True
            if not found_canonical_library:
                data.append(local_library)

    if list_libraries:
        listLibraries(data)
        return 0

    sdata = []
    if os.path.exists(state_filename):
        sdata = readJSONData(state_filename)

    # create source directory
    if not os.path.isdir(SRC_DIR):
        log("Creating directory " + SRC_DIR)
        os.mkdir(SRC_DIR)

    # create archive files directory
    if not os.path.isdir(ARCHIVE_DIR):
        log("Creating directory " + ARCHIVE_DIR)
        os.mkdir(ARCHIVE_DIR)

    failed_libraries = []

    for library in data:
        name = library.get('name', None)
        source = library.get('source', None)
        post = library.get('postprocess', None)

        if (opt_names) and (not name in opt_names):
            continue

        lib_dir = os.path.join(SRC_DIR, name)

        dlog("********** LIBRARY " + name + " **********")
        dlog("lib_dir = " + lib_dir + ")")

        # compare against cached state
        cached_state_ok = False
        if not opt_clean:
            for slibrary in sdata:
                sname = slibrary.get('name', None)
                if sname is not None and sname == name and slibrary == library and os.path.exists(lib_dir):
                    cached_state_ok = True
                    break

        if cached_state_ok:
            log("Cached state for " + name + " equals expected state; skipping library")
            continue
        else:
            # remove cached state for library
            sdata[:] = [s for s in sdata
                        if not (lambda s, name: s.get('name', None) is not None and s['name'] == name)(s, name)]

        # create library directory, if necessary
        if opt_clean:
            log("Cleaning directory for " + name)
            if os.path.exists(lib_dir):
                shutil.rmtree(lib_dir)
        if not os.path.exists(lib_dir):
            os.mkdir(lib_dir)

        try:
            # download source
            if source is not None:
                if 'type' not in source:
                    log("ERROR: Invalid schema for " + name + ": 'source' object must have a 'type'")
                    return -1
                if 'url' not in source:
                    log("ERROR: Invalid schema for " + name + ": 'source' object must have a 'url'")
                    return -1

                src_type = source['type']
                src_url = source['url']

                if src_type == "sourcefile":
                    sha1 = source.get('sha1', None)
                    user_agent = source.get('user-agent', None)
                    try:
                        if force_fallback:
                            raise RuntimeError
                        downloadFile(src_url, ARCHIVE_DIR, name, sha1,
                                     force_download=opt_clean_archives, user_agent=user_agent)
                        filename_rel = os.path.basename(src_url)
                        shutil.copyfile(os.path.join(ARCHIVE_DIR, filename_rel),
                                        os.path.join(lib_dir, filename_rel))
                    except:
                        if FALLBACK_URL:
                            if not force_fallback:
                                log("WARNING: Downloading of file " + src_url + " failed; trying fallback")
                            p = urlparse(src_url)
                            filename_rel = os.path.split(p.path)[1]  # get original filename
                            p = urlparse(FALLBACK_URL)
                            fallback_src_url = urlunparse([p[0], p[1],
                                                           p[2] + "/" + ARCHIVE_DIR_BASE + "/" + filename_rel,
                                                           p[3], p[4], p[5]])
                            downloadFile(fallback_src_url, ARCHIVE_DIR, name, sha1, force_download=True)
                            shutil.copyfile(os.path.join(ARCHIVE_DIR, filename_rel),
                                            os.path.join(lib_dir, filename_rel))
                        else:
                            shutil.rmtree(lib_dir)
                            raise
                elif src_type == "archive":
                    sha1 = source.get('sha1', None)
                    user_agent = source.get('user-agent', None)
                    try:
                        if force_fallback:
                            raise RuntimeError
                        downloadAndExtractFile(src_url, ARCHIVE_DIR, name, sha1,
                                               force_download=opt_clean_archives, user_agent=user_agent)
                    except:
                        if FALLBACK_URL:
                            if not force_fallback:
                                log("WARNING: Downloading of file " + src_url + " failed; trying fallback")
                            p = urlparse(src_url)
                            filename_rel = os.path.split(p.path)[1]  # get original filename
                            p = urlparse(FALLBACK_URL)
                            fallback_src_url = urlunparse([p[0], p[1],
                                                           p[2] + "/" + ARCHIVE_DIR_BASE + "/" + filename_rel,
                                                           p[3], p[4], p[5]])
                            downloadAndExtractFile(fallback_src_url, ARCHIVE_DIR, name, sha1, force_download=True)
                        else:
                            raise
                else:
                    revision = source.get('revision', None)
                    archive_name = name + ".tar.gz"  # for reading or writing of snapshot archives
                    if revision is not None:
                        archive_name = name + "_" + revision + ".tar.gz"
                    try:
                        if force_fallback:
                            raise RuntimeError
                        cloneRepository(src_type, src_url, name, revision)
                        if create_repo_snapshots:
                            log("Creating snapshot of library repository " + name)
                            repo_dir = os.path.join(SRC_DIR, name)
                            archive_filename = os.path.join(SNAPSHOT_DIR, archive_name)
                            dlog("Snapshot will be saved as " + archive_filename)
                            createArchiveFromDirectory(repo_dir, archive_filename, revision is None)
                    except:
                        if FALLBACK_URL:
                            if not force_fallback:
                                log("WARNING: Cloning of repository " + src_url + " failed; trying fallback")
                            # copy archived snapshot from fallback location
                            p = urlparse(FALLBACK_URL)
                            fallback_src_url = urlunparse([p[0], p[1],
                                                           p[2] + "/" + SNAPSHOT_DIR_BASE + "/" + archive_name,
                                                           p[3], p[4], p[5]])
                            dlog("Looking for snapshot " + fallback_src_url + " of library repository " + name)
                            # create snapshots files directory
                            downloadAndExtractFile(fallback_src_url, SNAPSHOT_DIR, name, force_download=True)
                            # reset repository state to particular revision (only using local operations inside the function)
                            cloneRepository(src_type, src_url, name, revision, True)
                        else:
                            raise
            else:
                # set up clean directory for potential patch application
                shutil.rmtree(lib_dir)
                os.mkdir(lib_dir)

            # post-processing
            if post is not None:
                if 'type' not in post:
                    log("ERROR: Invalid schema for " + name + ": 'postprocess' object must have a 'type'")
                    return -1
                if 'file' not in post:
                    log("ERROR: Invalid schema for " + name + ": 'postprocess' object must have a 'file'")
                    return -1

                post_type = post['type']
                post_file = post['file']

                if post_type == "patch":
                    applyPatchFile(post_file, name, post.get('pnum', DEFAULT_PNUM))
                elif post_type == "script":
                    runPythonScript(post_file)
                else:
                    log("ERROR: Unknown post-processing type '" + post_type + "' for " + name)
                    return -1

            # add to cached state
            sdata.append(library)

            # write out cached state
            writeJSONData(sdata, state_filename)
        except:
            log("ERROR: Failure to bootstrap library " + name + " (reason: " + str(sys.exc_info()[0]) + ")")
            traceback.print_exc()
            failed_libraries.append(name)

    if failed_libraries:
        log("***************************************")
        log("FAILURE to bootstrap the following libraries:")
        log(', '.join(failed_libraries))
        log("***************************************")
        return -1

    log("Finished")
    return 0
def main(argv):
    global BASE_DIR, SRC_DIR, ARCHIVE_DIR, DEBUG_OUTPUT, FALLBACK_URL, USE_TAR, USE_UNZIP
    global TOOL_COMMAND_PYTHON, TOOL_COMMAND_GIT, TOOL_COMMAND_HG, TOOL_COMMAND_SVN, TOOL_COMMAND_PATCH, TOOL_COMMAND_TAR, TOOL_COMMAND_UNZIP

    try:
        opts, args = getopt.getopt(
            argv, "ln:N:cCb:h",
            ["list", "name=", "name-file=", "skip=", "clean", "clean-all", "base-dir", "bootstrap-file=",
             "local-bootstrap-file=", "use-tar", "use-unzip", "repo-snapshots", "fallback-url=",
             "force-fallback", "debug-output", "help", "break-on-first-error"])
    except getopt.GetoptError:
        printOptions()
        return 0

    opt_names = []
    name_files = []
    skip_libs = []
    opt_clean = False
    opt_clean_archives = False
    list_libraries = False
    default_bootstrap_filename = "bootstrap.json"
    bootstrap_filename = os.path.abspath(os.path.join(BASE_DIR, default_bootstrap_filename))
    local_bootstrap_filename = ""
    create_repo_snapshots = False
    force_fallback = False
    break_on_first_error = False
    base_dir_path = ""

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            printOptions()
            return 0
        if opt in ("-l", "--list"):
            list_libraries = True
        if opt in ("-n", "--name"):
            opt_names.append(arg)
        if opt in ("-N", "--name-file"):
            name_files.append(os.path.abspath(arg))
        if opt in ("--skip",):
            skip_libs.append(arg)
        if opt in ("-c", "--clean"):
            opt_clean = True
        if opt in ("-C", "--clean-all"):
            opt_clean = True
            opt_clean_archives = True
        if opt in ("-b", "--base-dir"):
            base_dir_path = os.path.abspath(arg)
            BASE_DIR = base_dir_path
            SRC_DIR = os.path.join(BASE_DIR, SRC_DIR_BASE)
            ARCHIVE_DIR = os.path.join(BASE_DIR, ARCHIVE_DIR_BASE)
            bootstrap_filename = os.path.join(BASE_DIR, default_bootstrap_filename)
            log("Using " + arg + " as base directory")
        if opt in ("--bootstrap-file",):
            bootstrap_filename = os.path.abspath(arg)
            log("Using main bootstrap file " + bootstrap_filename)
        if opt in ("--local-bootstrap-file",):
            local_bootstrap_filename = os.path.abspath(arg)
            log("Using local bootstrap file " + local_bootstrap_filename)
        if opt in ("--use-tar",):
            USE_TAR = True
        if opt in ("--use-unzip",):
            USE_UNZIP = True
        if opt in ("--repo-snapshots",):
            create_repo_snapshots = True
            log("Will create repository snapshots")
        if opt in ("--fallback-url",):
            FALLBACK_URL = arg
        if opt in ("--force-fallback",):
            force_fallback = True
            log("Using fallback URL to fetch all libraries")
        if opt in ("--break-on-first-error",):
            break_on_first_error = True
        if opt in ("--debug-output",):
            DEBUG_OUTPUT = True

    if platform.system() != "Windows":
        # Unfortunately some IDEs do not have a proper PATH environment variable set,
        # so we search manually for the required tools in some obvious locations.
        paths_to_search = os.environ["PATH"].split(":") + ["/usr/local/bin", "/opt/local/bin", "/usr/bin"]
        TOOL_COMMAND_PYTHON = findToolCommand(TOOL_COMMAND_PYTHON, paths_to_search, required=True)
        TOOL_COMMAND_GIT = findToolCommand(TOOL_COMMAND_GIT, paths_to_search, required=True)
        TOOL_COMMAND_HG = findToolCommand(TOOL_COMMAND_HG, paths_to_search, required=True)
        TOOL_COMMAND_SVN = findToolCommand(TOOL_COMMAND_SVN, paths_to_search, required=True)
        TOOL_COMMAND_PATCH = findToolCommand(TOOL_COMMAND_PATCH, paths_to_search, required=True)
        TOOL_COMMAND_TAR = findToolCommand(TOOL_COMMAND_TAR, paths_to_search, required=USE_TAR)
        TOOL_COMMAND_UNZIP = findToolCommand(TOOL_COMMAND_UNZIP, paths_to_search, required=USE_UNZIP)

    if base_dir_path:
        os.chdir(base_dir_path)

    if name_files:
        for name_file in name_files:
            try:
                with open(name_file) as f:
                    opt_names_local = [l for l in (line.strip() for line in f) if l]
                    opt_names_local = [l for l in opt_names_local if l[0] != '#']
                    opt_names += opt_names_local
                    dlog("Name file contains: " + ", ".join(opt_names_local))
            except:
                log("ERROR: cannot parse name file " + name_file)
                return -1

    if force_fallback and not FALLBACK_URL:
        log("Error: cannot force usage of the fallback location without specifying a fallback URL")
        return -1

    state_filename = os.path.join(os.path.dirname(os.path.splitext(bootstrap_filename)[0]),
                                  "." + os.path.basename(os.path.splitext(bootstrap_filename)[0])) \
        + os.path.splitext(bootstrap_filename)[1]

    dlog("bootstrap_filename = " + bootstrap_filename)
    dlog("state_filename = " + state_filename)

    # read canonical libraries data
    data = readJSONData(bootstrap_filename)
    if data is None:
        return -1

    # some sanity checking
    for library in data:
        if library.get('name', None) is None:
            log("ERROR: Invalid schema: library object does not have a 'name'")
            return -1

    # read local libraries data, if available
    local_data = None
    if local_bootstrap_filename:
        local_data = readJSONData(local_bootstrap_filename)
        if local_data is None:
            return -1
        # some sanity checking
        for local_library in local_data:
            if local_library.get('name', None) is None:
                log("ERROR: Invalid schema: local library object does not have a 'name'")
                return -1

    # merge canonical and local library data, if applicable; local libraries take precedence
    if local_data is not None:
        for local_library in local_data:
            local_name = local_library.get('name', None)
            found_canonical_library = False
            for n, library in enumerate(data):
                name = library.get('name', None)
                if local_name == name:
                    data[n] = local_library  # overwrite library
                    found_canonical_library = True
            if not found_canonical_library:
                data.append(local_library)

    if list_libraries:
        listLibraries(data)
        return 0

    sdata = []
    if os.path.exists(state_filename):
        sdata = readJSONData(state_filename)

    # create source directory
    if not os.path.isdir(SRC_DIR):
        log("Creating directory " + SRC_DIR)
        os.mkdir(SRC_DIR)

    # create archive files directory
    if not os.path.isdir(ARCHIVE_DIR):
        log("Creating directory " + ARCHIVE_DIR)
        os.mkdir(ARCHIVE_DIR)

    failed_libraries = []

    for library in data:
        name = library.get('name', None)
        source = library.get('source', None)
        post = library.get('postprocess', None)

        if (skip_libs) and (name in skip_libs):
            continue
        if (opt_names) and (not name in opt_names):
            continue

        lib_dir = os.path.join(SRC_DIR, name)
        lib_dir = lib_dir.replace(os.path.sep, '/')

        dlog("********** LIBRARY " + name + " **********")
        dlog("lib_dir = " + lib_dir + ")")

        # compare against cached state
        cached_state_ok = False
        if not opt_clean:
            for slibrary in sdata:
                sname = slibrary.get('name', None)
                if sname is not None and sname == name and slibrary == library and os.path.exists(lib_dir):
                    cached_state_ok = True
                    break

        if cached_state_ok:
            log("Cached state for " + name + " equals expected state; skipping library")
            continue
        else:
            # remove cached state for library
            sdata[:] = [s for s in sdata
                        if not (lambda s, name: s.get('name', None) is not None and s['name'] == name)(s, name)]

        # create library directory, if necessary
        if opt_clean:
            log("Cleaning directory for " + name)
            if os.path.exists(lib_dir):
                shutil.rmtree(lib_dir)
        if not os.path.exists(lib_dir):
            os.makedirs(lib_dir)

        try:
            # download source
            if source is not None:
                if 'type' not in source:
                    log("ERROR: Invalid schema for " + name + ": 'source' object must have a 'type'")
                    return -1
                if 'url' not in source:
                    log("ERROR: Invalid schema for " + name + ": 'source' object must have a 'url'")
                    return -1

                src_type = source['type']
                src_url = source['url']

                if src_type == "sourcefile":
                    sha1 = source.get('sha1', None)
                    user_agent = source.get('user-agent', None)
                    try:
                        if force_fallback:
                            raise RuntimeError
                        downloadFile(src_url, ARCHIVE_DIR, name, sha1,
                                     force_download=opt_clean_archives, user_agent=user_agent)
                        filename_rel = os.path.basename(src_url)
                        shutil.copyfile(os.path.join(ARCHIVE_DIR, filename_rel),
                                        os.path.join(lib_dir, filename_rel))
                    except:
                        if FALLBACK_URL:
                            if not force_fallback:
                                log("WARNING: Downloading of file " + src_url + " failed; trying fallback")
                            p = urlparse(src_url)
                            filename_rel = os.path.split(p.path)[1]  # get original filename
                            p = urlparse(FALLBACK_URL)
                            fallback_src_url = urlunparse([p[0], p[1],
                                                           p[2] + "/" + ARCHIVE_DIR_BASE + "/" + filename_rel,
                                                           p[3], p[4], p[5]])
                            downloadFile(fallback_src_url, ARCHIVE_DIR, name, sha1, force_download=True)
                            shutil.copyfile(os.path.join(ARCHIVE_DIR, filename_rel),
                                            os.path.join(lib_dir, filename_rel))
                        else:
                            shutil.rmtree(lib_dir)
                            raise
                elif src_type == "archive":
                    sha1 = source.get('sha1', None)
                    user_agent = source.get('user-agent', None)
                    try:
                        if force_fallback:
                            raise RuntimeError
                        downloadAndExtractFile(src_url, ARCHIVE_DIR, name, sha1,
                                               force_download=opt_clean_archives, user_agent=user_agent)
                    except:
                        if FALLBACK_URL:
                            if not force_fallback:
                                log("WARNING: Downloading of file " + src_url + " failed; trying fallback")
                            p = urlparse(src_url)
                            filename_rel = os.path.split(p.path)[1]  # get original filename
                            p = urlparse(FALLBACK_URL)
                            fallback_src_url = urlunparse([p[0], p[1],
                                                           p[2] + "/" + ARCHIVE_DIR_BASE + "/" + filename_rel,
                                                           p[3], p[4], p[5]])
                            downloadAndExtractFile(fallback_src_url, ARCHIVE_DIR, name, sha1, force_download=True)
                        else:
                            raise
                else:
                    revision = source.get('revision', None)
                    archive_name = name + ".tar.gz"  # for reading or writing of snapshot archives
                    if revision is not None:
                        archive_name = name + "_" + revision + ".tar.gz"
                    try:
                        if force_fallback:
                            raise RuntimeError
                        cloneRepository(src_type, src_url, name, revision)
                        if create_repo_snapshots:
                            log("Creating snapshot of library repository " + name)
                            repo_dir = os.path.join(SRC_DIR, name)
                            archive_filename = os.path.join(SNAPSHOT_DIR, archive_name)
                            dlog("Snapshot will be saved as " + archive_filename)
                            createArchiveFromDirectory(repo_dir, archive_filename, revision is None)
                    except:
                        if FALLBACK_URL:
                            if not force_fallback:
                                log("WARNING: Cloning of repository " + src_url + " failed; trying fallback")
                            # copy archived snapshot from fallback location
                            p = urlparse(FALLBACK_URL)
                            fallback_src_url = urlunparse([p[0], p[1],
                                                           p[2] + "/" + SNAPSHOT_DIR_BASE + "/" + archive_name,
                                                           p[3], p[4], p[5]])
                            dlog("Looking for snapshot " + fallback_src_url + " of library repository " + name)
                            # create snapshots files directory
                            downloadAndExtractFile(fallback_src_url, SNAPSHOT_DIR, name, force_download=True)
                            # reset repository state to particular revision (only using local operations inside the function)
                            cloneRepository(src_type, src_url, name, revision, True)
                        else:
                            raise
            else:
                # set up clean directory for potential patch application
                shutil.rmtree(lib_dir)
                os.mkdir(lib_dir)

            # post-processing
            if post is not None:
                if 'type' not in post:
                    log("ERROR: Invalid schema for " + name + ": 'postprocess' object must have a 'type'")
                    return -1
                if 'file' not in post:
                    log("ERROR: Invalid schema for " + name + ": 'postprocess' object must have a 'file'")
                    return -1

                post_type = post['type']
                post_file = post['file']

                if post_type == "patch":
                    applyPatchFile(post_file, name, post.get('pnum', DEFAULT_PNUM))
                elif post_type == "script":
                    runPythonScript(post_file)
                else:
                    log("ERROR: Unknown post-processing type '" + post_type + "' for " + name)
                    return -1

            # add to cached state
            sdata.append(library)

            # write out cached state
            writeJSONData(sdata, state_filename)
        except:
            log("ERROR: Failure to bootstrap library " + name + " (reason: " + str(sys.exc_info()[0]) + ")")
            if break_on_first_error:
                exit(-1)
            traceback.print_exc()
            failed_libraries.append(name)

    if failed_libraries:
        log("***************************************")
        log("FAILURE to bootstrap the following libraries:")
        log(', '.join(failed_libraries))
        log("***************************************")
        return -1

    log("Finished")
    return 0
def get_epg_from_receiver(self, provider, url):
    # reduce the pids to the ones containing SDT (0x11) and EIT (0x12)
    url_st = urlparse(url)
    queries = url_st.query
    new_queries = ""
    if queries:
        for eq in queries.split("&"):
            key = eq.split("=")[0]
            value = eq.split("=")[1]
            if key == 'pids':
                value = "0,17,18"
            new_queries += key + "=" + value + "&"
        new_queries = new_queries.strip("&")
    url_epd_pids_only = urlunparse((
        url_st.scheme,
        url_st.netloc,
        url_st.path,
        url_st.params,
        new_queries,
        url_st.fragment,
    ))

    # process arguments
    attr = [
        os.path.join(self.origin_dir, 'epg_grap.sh'),
        url_epd_pids_only,
        provider,
        str(self.config.read('epgloops')),
        str(self.config.read('epgtimeout'))
    ]
    self.logger.info("epg_grap started {0} {1} {2}".format(provider, url_epd_pids_only, repr(attr)))

    try:
        self.process = subprocess.Popen(attr, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        cleaner = Timer(600, self.cleanProcess)  # if epg_grap won't exit, try to terminate its process after 600 seconds
        cleaner.start()
        epg_out, err = self.process.communicate()
        # self.process.wait()  # oops... not needed? harmless!
        cleaner.cancel()
        if err:
            self.logger.warning("epg_grap ended with an error:\n%s" % (err))
        else:
            self.logger.debug("epg_grap ended")

        epg_json_string = epg_out.decode()
        epg_json = json.loads(epg_json_string)
        result = {}
        count = 0
        for json_movie in epg_json['details'].values():
            start = json_movie['unixTimeBegin']
            stop = json_movie['unixTimeEnd']
            if json_movie['title']:
                title = self.split_text_by_capital_chars(json_movie['title'])[0]
            else:
                title = json_movie['name']
            desc = '\n'.join(self.split_text_by_capital_chars(json_movie['description']))
            category = json_movie['name']
            count += 1

            # we'll use the name of the stream source plugin instead of the name of the EPG plugin itself
            # plugin_name = self.plugin_names[0]
            plugin_name = self.stream_source
            self.providers.add(provider)
            # EPG has its own special hardwired categories
            # self.categories.add(category)

            new_movie = MovieInfo(
                url=url,
                mime='video/MP2T',
                title=title,
                category=category,
                source=plugin_name,
                source_type=defaults.MOVIE_TYPE_STREAM,
                provider=provider,
                timestamp=int(start),
                duration=stop - start,
                description=desc)

            if not plugin_name in self.movies:
                self.movies[plugin_name] = {}
            self.movies[plugin_name][new_movie['uri']] = new_movie
            result[start] = new_movie

        for json_provider in epg_json['providers']:
            self.logger.debug("channel found in epg: {0}".format(json_provider))
        self.logger.info("{0} epg loaded, {1} entries".format(provider, count))
        return result
    except Exception as ex:
        self.logger.warning("epg_grap could not be started. Error: %s" % (ex))
        return
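# Hedged illustration (not from the original project) of the query rewrite performed
# above: every parameter is copied through unchanged except 'pids', which is forced to
# "0,17,18" (PID 0 plus SDT 0x11 and EIT 0x12). The receiver URL is hypothetical.
from urllib.parse import urlparse, urlunparse

u = urlparse("http://receiver.local:8001/stream?src=1&pids=0,100,101")
parts = []
for kv in u.query.split("&"):
    key, value = kv.split("=")
    if key == "pids":
        value = "0,17,18"
    parts.append(key + "=" + value)
print(urlunparse((u.scheme, u.netloc, u.path, u.params, "&".join(parts), u.fragment)))
# -> http://receiver.local:8001/stream?src=1&pids=0,17,18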
def http_error_302(self, req, fp, code, msg, headers):
    # Some servers (incorrectly) return multiple Location headers
    # (so probably same goes for URI). Use first header.
    if "location" in headers:
        newurl = headers["location"]
    elif "uri" in headers:
        newurl = headers["uri"]
    else:
        return

    if newurl.startswith('training.phtml'):
        newurl = '/island/' + newurl
    elif not newurl.startswith('/'):
        newurl = '/' + newurl  # fix relative URL
    newurl = 'http://www.neopets.com' + newurl  # fix a possible malformed URL

    urlparts = urlparse(newurl)

    # For security reasons we don't allow redirection to anything other
    # than http, https or ftp.
    if not urlparts.scheme in ('http', 'https', 'ftp'):
        raise HTTPError(newurl, code,
                        msg + " - Redirection to url '%s' is not allowed" % newurl,
                        headers, fp)

    if not urlparts.path:
        urlparts = list(urlparts)
        urlparts[2] = "/"
    newurl = urlunparse(urlparts)

    newurl = urljoin(req.full_url, newurl)

    # XXX Probably want to forget about the state of the current
    # request, although that might interact poorly with other
    # handlers that also use handler-specific request attributes
    new = self.redirect_request(req, fp, code, msg, headers, newurl)
    if new is None:
        return

    # loop detection
    # .redirect_dict has a key url if url was previously visited.
    if hasattr(req, 'redirect_dict'):
        visited = new.redirect_dict = req.redirect_dict
        if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
            raise HTTPError(req.full_url, code, self.inf_msg + msg, headers, fp)
    else:
        visited = new.redirect_dict = req.redirect_dict = {}
    visited[newurl] = visited.get(newurl, 0) + 1

    # Don't close the fp until we are sure that we won't use it
    # with HTTPError.
    fp.read()
    fp.close()

    return self.parent.open(new, timeout=req.timeout)