def tearDown(self):
    fs = hdfs.hdfs("", 0)
    fs.delete(self.local_wd)
    fs.close()
    fs = hdfs.hdfs("default", 0)
    fs.delete(self.hdfs_wd)
    fs.close()
def write(writeFlag):
    if writeFlag:
        # instantiate hadoop
        hdfs.hdfs()
        targetPath = config.targetPath
        targetDirectory = config.targetDirectory
        sourceFile = config.sourceFile
        print("Target Path: " + targetPath)
        print("Target Directory: " + targetDirectory)
        print("Source Path: " + sourceFile)
        with open(sourceFile, "r") as dumpFile:
            fullText = dumpFile.read()
        # write to hadoop
        hdfs.dump(fullText, targetPath)
def capacity(self):
    fs = hdfs.hdfs("", 0)
    self.assertRaises(RuntimeError, fs.capacity)
    fs.close()
    if not hdfs.default_is_local():
        fs = hdfs.hdfs("default", 0)
        cap = fs.capacity()
        self.assertGreaterEqual(cap, 0)
        fs.close()
def cache(self):
    orig_fs = hdfs.hdfs(*self.hp_cases[0])
    for host, port in self.hp_cases[1:]:
        fs = hdfs.hdfs(host, port)
        self.assertTrue(fs.fs is orig_fs.fs)
        fs.close()
        self.assertFalse(orig_fs.closed)
    orig_fs.close()
    self.assertTrue(orig_fs.closed)
def cache(self):
    hdfs.hdfs._CACHE.clear()
    orig_fs = hdfs.hdfs(*self.hp_cases[0])
    for host, port in self.hp_cases[1:]:
        fs = hdfs.hdfs(host, port)
        self.assertTrue(fs.fs is orig_fs.fs)
        fs.close()
        self.assertFalse(orig_fs.closed)
    orig_fs.close()
    self.assertTrue(orig_fs.closed)
# Assumed decorator (requires "import contextlib"): the try/finally-around-
# yield pattern below only gives context-manager semantics when wrapped.
@contextlib.contextmanager
def _hdfs_filesystem():
    """Retrieve references to the local and HDFS file systems.

    Need to be able to specify host/port. For now, works off defaults.
    """
    fs = hdfs("default", 0)
    lfs = hdfs("", 0)
    try:
        yield fs, lfs
    finally:
        fs.close()
        lfs.close()
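# Usage sketch for _hdfs_filesystem, assuming the contextlib.contextmanager
# decorator above and a configured default namenode (both assumptions):
def _print_filesystem_info():
    with _hdfs_filesystem() as (fs, lfs):
        # fs talks to the default namenode, lfs to the local file system
        print(fs.capacity())            # raises RuntimeError on a local fs
        print(lfs.working_directory())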
def cache(self):
    for (h1, p1), (h2, p2) in product(self.hp_cases, repeat=2):
        hdfs.hdfs._CACHE.clear()
        hdfs.hdfs._ALIASES = {"host": {}, "port": {}, "user": {}}  # FIXME
        with hdfs.hdfs(h1, p1) as fs1:
            with hdfs.hdfs(h2, p2) as fs2:
                print(' * %r vs %r' % ((h1, p1), (h2, p2)))
                self.assertTrue(fs2.fs is fs1.fs)
            for fs in fs1, fs2:
                self.assertFalse(fs.closed)
        for fs in fs1, fs2:
            self.assertTrue(fs.closed)
def copyFileToHDFSFolder(localpath, hdfspath):
    """
    Copies a file from the local file system or HDFS to an HDFS location.

    :param localpath: path to the source file
    :param hdfspath: path to the target file on HDFS
    :return: None
    """
    if localpath.startswith('file:/'):
        lf = H.hdfs("", 0)
    else:
        lf = H.hdfs()
    h = H.hdfs()
    lf.copy(localpath, h, hdfspath)
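# Usage sketch for copyFileToHDFSFolder: a 'file:/' prefix selects the local
# file system as the source; anything else is treated as an HDFS path.  The
# paths are hypothetical and "import pydoop.hdfs as H" plus a configured
# default namenode are assumed.
copyFileToHDFSFolder("file:/tmp/input.csv", "/user/me/input.csv")  # local -> HDFS
copyFileToHDFSFolder("/user/me/input.csv", "/user/me/copy.csv")    # HDFS -> HDFS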
def read(readFlag):
    print(readFlag)
    if readFlag:
        targetFile = config.targetFile.strip()
        targetDirectory = config.targetDirectory.strip()
        targetPath = config.targetPath
        print(targetPath)
        # instantiate hadoop
        hdfs.hdfs()
        # read from hadoop
        with hdfs.open(targetPath) as fileToRead:
            print(fileToRead.read())
def save_checkpoint(path, session=None):
    if session is None:
        session = tf.get_default_session()
    if session is None:
        raise RuntimeError("no session specified and no current session")
    saver = tf.train.Saver()
    wd = tempfile.mkdtemp(prefix="pydeep_")
    sub_d = hdfs.path.splitext(hdfs.path.basename(path))[0]
    abs_d = os.path.join(wd, sub_d)
    os.makedirs(abs_d)
    saver.save(session, os.path.join(abs_d, Model.CHECKPOINT_NAME))
    zip_fn = "%s.zip" % abs_d
    shutil.make_archive(*zip_fn.rsplit(".", 1), root_dir=abs_d)
    with hdfs.hdfs() as fs, hdfs.hdfs("", 0) as local_fs:
        local_fs.copy(zip_fn, fs, path)
def stat_on_local(self):
    wd_ = tempfile.mkdtemp(prefix='pydoop_', suffix=UNI_CHR)
    p_ = os.path.join(wd_, make_random_str())
    if hdfs.default_is_local():
        wd, p = wd_, p_
        host = "default"
    else:
        wd, p = ('file:%s' % _ for _ in (wd_, p_))
        host = ""
    fs = hdfs.hdfs(host, 0)
    with fs.open_file(p_, 'w') as fo:
        fo.write(make_random_str())
    info = fs.get_path_info(p_)
    fs.close()
    s = hdfs.path.stat(p)
    os_s = os.stat(p_)
    for n in dir(s):
        if n.startswith('st_'):
            try:
                exp_v = getattr(os_s, n)
            except AttributeError:
                try:
                    exp_v = info[self.NMAP[n]]
                except KeyError:
                    continue
            self.assertEqual(getattr(s, n), exp_v)
    self.__check_extra_args(s, info)
    self.__check_wrapper_funcs(p)
    hdfs.rmr(wd)
def run(self):
    if self.options is None:
        raise RuntimeError("You must call parse_cmd_line before run")
    if self.logger.isEnabledFor(logging.DEBUG):
        self.logger.debug("Running Seqal")
        self.logger.debug("Properties:\n%s", "\n".join(
            sorted(["%s = %s" % (str(k), str(v))
                    for k, v in self.properties.iteritems()])))
    self.logger.info("Input: %s; Output: %s; reference: %s",
                     self.options.input, self.options.output,
                     self.options.reference)
    try:
        self.hdfs = phdfs.hdfs('default', 0)
        self.__validate()
        self.remote_bin_name = tempfile.mktemp(
            prefix='seqal_bin.', suffix=str(random.random()), dir='')
        try:
            with self.hdfs.open_file(self.remote_bin_name, 'w') as script:
                self.__write_pipes_script(script)
            full_name = self.hdfs.get_path_info(self.remote_bin_name)['name']
            return seal_utilities.run_pipes(
                full_name, self.options.input, self.options.output,
                properties=self.properties, args_list=self.left_over_args)
        finally:
            try:
                # delete the temporary pipes script from HDFS
                self.hdfs.delete(self.remote_bin_name)
                self.logger.debug("pipes script %s deleted",
                                  self.remote_bin_name)
            except Exception:
                # don't re-raise the exception: we're on our way out
                self.logger.error(
                    "Error deleting the temporary pipes script %s from HDFS",
                    self.remote_bin_name)
    finally:
        if self.hdfs:
            tmp = self.hdfs
            self.hdfs = None
            tmp.close()
            self.logger.debug("HDFS closed")
def connect(self):
    for host, port in self.hp_cases:
        for user in self.u_cases:
            expected_user = user or CURRENT_USER
            fs = hdfs.hdfs(host, port, user=user)
            self.assertEqual(fs.user, expected_user)
            fs.close()
def build_map(self, top_dir):
    """\
    For each subdir (corresponding to an image class), build the full
    list of (filename, offset) pairs from which each bottleneck dump can
    be retrieved, e.g.:

    {'dandelion': [('part-m-00000', 0), ('part-m-00000', 8192), ...,
                   ('part-m-00003', 163840)],
     'roses': [('part-m-00000', 0), ...]}
    """
    m = {}
    basename = hdfs.path.basename
    with hdfs.hdfs() as fs:
        for stat in fs.list_directory(top_dir):
            if stat['kind'] != 'directory':
                continue
            subd = stat['name']
            positions = []
            for s in fs.list_directory(subd):
                bname = basename(s["name"])
                if bname.startswith("_"):
                    continue
                assert s["size"] % self.record_size == 0
                for i in range(0, s["size"], self.record_size):
                    positions.append((bname, i))
            m[basename(subd)] = positions
    return m
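# Hypothetical consumer for the map built above (not from the source): read
# a single fixed-size bottleneck record back via its (filename, offset) pair.
import pydoop.hdfs as hdfs

def read_record(top_dir, class_name, index, positions, record_size):
    bname, offset = positions[class_name][index]
    path = hdfs.path.join(top_dir, class_name, bname)
    with hdfs.open(path) as f:  # default mode is binary read
        f.seek(offset)
        return f.read(record_size)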
def __create_data_file(self):
    host, port, path = split_hdfs_path(self.data_file_name)
    fs = hdfs(host, port)
    f = fs.open_file(path, os.O_WRONLY, 0, 0, 0)
    f.write(self.f.getvalue())
    f.close()
    fs.close()
class StorageHandler:

    def __init__(self, host, port):
        # the class-level "hdfsobj = hdfs.hdfs()" attribute was dropped: it
        # opened a connection at class definition time only to be shadowed
        # by this instance attribute
        self.hdfsobj = hdfs.hdfs(host, port, user="******",
                                 groups=["vagrant"])

    def pwd(self):
        return self.hdfsobj.working_directory()

    def listDirectory(self, path="/"):
        return self.hdfsobj.list_directory(path)

    def delete(self, path):
        self.hdfsobj.delete(path, False)

    def put(self, source, destination):
        hdfs.put(source, destination)

    def copyFile(self, source, destination):
        self.hdfsobj.copy(source, self.hdfsobj, destination)

    def write(self, path, mod, data):
        with hdfs.open(path, mod) as f:
            f.write(data)
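# Usage sketch for StorageHandler, with a hypothetical host/port (the masked
# user in __init__ would need a real account name):
handler = StorageHandler("namenode.example.org", 8020)
print(handler.pwd())
for entry in handler.listDirectory("/user"):
    print(entry["name"], entry["kind"])
handler.write("/user/vagrant/hello.txt", "wt", "hello\n")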
def __init__(self, tableName, host='192.168.60.64', infoTable='runInfo'):
    self.tableName = tableName
    self.conn = happybase.Connection(host)
    self.table = self.conn.table(infoTable)
    self.eventdb = self.conn.table('HEP:' + tableName)
    self.escape = escape()
    self.fs = hdfs.hdfs(host=host, port=8022, user='******')
def setUp(self):
    if hdfs.default_is_local():
        self.root = "file:"
    else:
        fs = hdfs.hdfs("default", 0)
        self.root = "hdfs://%s:%s" % (fs.host, fs.port)
        fs.close()
def __warn_user_if_wd_maybe_unreadable(self, abs_remote_path):
    """
    Check directories above the remote module and issue a warning if
    they are not traversable by all users.

    This is mainly aimed at set-ups with a centralized Hadoop cluster,
    accessed by all users, where the Hadoop task tracker user is not a
    superuser; an example is a shared Hadoop installation without HDFS
    (using only a POSIX shared file system).  The task tracker correctly
    changes user to the job requester's user for most operations, but not
    when initializing the distributed cache, so jobs that want to place
    files not accessible by the Hadoop user into the distributed cache
    fail.
    """
    host, port, path = hdfs.path.split(abs_remote_path)
    if host == '' and port == 0:  # local file system
        host_port = "file:///"
    else:
        # FIXME: this won't work with any scheme other than hdfs:// (e.g., s3)
        host_port = "hdfs://%s:%s/" % (host, port)
    path_pieces = path.strip('/').split(os.path.sep)
    fs = hdfs.hdfs(host, port)
    for i in xrange(0, len(path_pieces)):
        part = os.path.join(host_port, os.path.sep.join(path_pieces[0:i + 1]))
        permissions = fs.get_path_info(part)['permissions']
        if permissions & 0111 != 0111:
            self.logger.warning(
                "the remote module %s may not be readable\n"
                "by the task tracker when initializing the distributed "
                "cache.\nPermissions on path %s: %s",
                abs_remote_path, part, oct(permissions))
            break
def get():
    """
    Get a handle to pydoop hdfs using the default namenode (specified in
    hadoop config).

    Returns:
        Pydoop hdfs handle
    """
    return hdfs.hdfs('default', 0, user=project_user())
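# Usage sketch for get(), assuming project_user() resolves to a valid HDFS
# user (the path below is hypothetical):
fs = get()
try:
    for entry in fs.list_directory("/user"):
        print(entry["name"])
finally:
    fs.close()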
def __warn_user_if_wd_maybe_unreadable(self, abs_remote_path):
    """
    Check directories above the remote module and issue a warning if
    they are not traversable by all users.

    This is mainly aimed at set-ups with a centralized Hadoop cluster,
    accessed by all users, where the Hadoop task tracker user is not a
    superuser; an example is a shared Hadoop installation without HDFS
    (using only a POSIX shared file system).  The task tracker correctly
    changes user to the job requester's user for most operations, but not
    when initializing the distributed cache, so jobs that want to place
    files not accessible by the Hadoop user into the distributed cache
    fail.
    """
    host, port, path = hdfs.path.split(abs_remote_path)
    if host == '' and port == 0:  # local file system
        host_port = "file:///"
    else:
        # FIXME: this won't work with any scheme other than
        # hdfs:// (e.g., s3)
        host_port = "hdfs://%s:%s/" % (host, port)
    path_pieces = path.strip('/').split(os.path.sep)
    fs = hdfs.hdfs(host, port)
    for i in range(0, len(path_pieces)):
        part = os.path.join(host_port,
                            os.path.sep.join(path_pieces[0:i + 1]))
        permissions = fs.get_path_info(part)['permissions']
        if permissions & 0o111 != 0o111:
            self.logger.warning(
                ("remote module %s may not be readable by the task "
                 "tracker when initializing the distributed cache. "
                 "Permissions on %s: %s"),
                abs_remote_path, part, oct(permissions))
            break
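# Worked example of the traversability test above: 0o111 is the three
# execute bits, so a directory is traversable by everyone only if all of
# them are set.
for mode in (0o755, 0o750, 0o700):
    traversable = (mode & 0o111) == 0o111
    print(oct(mode), "traversable by all:", traversable)  # only 0o755 is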
def load_checkpoint(path, session=None):
    if session is None:
        session = tf.get_default_session()
    if session is None:
        raise RuntimeError("no session specified and no current session")
    wd = tempfile.mkdtemp(prefix="pydeep_")
    zip_fn = os.path.join(wd, hdfs.path.basename(path))
    with hdfs.hdfs() as fs, hdfs.hdfs("", 0) as local_fs:
        fs.copy(path, local_fs, zip_fn)
    unpack_dir = os.path.splitext(zip_fn)[0]
    shutil.unpack_archive(zip_fn, unpack_dir)
    ckpt_path = os.path.join(unpack_dir, Model.CHECKPOINT_NAME)
    metagraph_path = "%s.meta" % ckpt_path
    if not os.path.isfile(metagraph_path):
        raise RuntimeError("checkpoint files not found in %s" % zip_fn)
    saver = tf.train.import_meta_graph(metagraph_path)
    saver.restore(session, ckpt_path)
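# Hedged round-trip sketch for save_checkpoint (a few snippets above) and
# load_checkpoint: save from one session, restore into a fresh graph.
# "pydeep_model.zip" is a hypothetical HDFS path; TF1-style sessions are
# assumed, matching the snippets themselves.
import tensorflow as tf

g1 = tf.Graph()
with g1.as_default(), tf.Session(graph=g1) as session:
    tf.get_variable("v", shape=[2], initializer=tf.zeros_initializer())
    session.run(tf.global_variables_initializer())
    save_checkpoint("pydeep_model.zip", session=session)

g2 = tf.Graph()
with g2.as_default(), tf.Session(graph=g2) as session:
    load_checkpoint("pydeep_model.zip", session=session)  # rebuilds the graph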
def __init__(self, context):
    super(WholeFileReader, self).__init__(context)
    self.logger = LOGGER.getChild("WholeFileReader")
    raw_split = context.get_input_split(raw=True)
    self.isplit = OpaqueInputSplit().read(io.BytesIO(raw_split))
    self.paths = self.isplit.payload
    self.n_paths = len(self.paths)
    self.fs = hdfs.hdfs()
def clearLocatie():
    """
    Removes the locatie parquet table, if it exists.

    :return: None
    """
    h = hdfs()
    if h.exists(LOCATIE):
        h.delete(LOCATIE)
def clearTelling():
    """
    Removes the telling parquet table, if it exists.

    :return: None
    """
    h = hdfs()
    if h.exists(TELLING):
        h.delete(TELLING)
def get_res(output_dir):
    fs = hdfs()
    data = []
    for x in fs.list_directory(output_dir):
        if os.path.split(x['path'])[-1].startswith('part-'):
            with fs.open_file(x['path']) as f:
                data.append(f.read())
    all_data = ''.join(data)
    return pts.parse_mr_output(all_data, vtype=int)
def get_res(output_dir):
    fs = hdfs()
    data = []
    for x in fs.list_directory(output_dir):
        if os.path.split(x['path'])[-1].startswith('part-'):
            with fs.open_file(x['path'], 'rt') as f:
                data.append(f.read())
    all_data = ''.join(data)
    return pts.parse_mr_output(all_data, vtype=int)
def __init__(self, context):
    super(FastaReader, self).__init__()
    self.logger = logging.getLogger(self.__class__.__name__)
    self.isplit = InputSplit(context.getInputSplit())
    self.host, self.port, self.fpath = split_hdfs_path(self.isplit.filename)
    self.fs = hdfs(self.host, self.port)
    self.file = self.fs.open_file(self.fpath, os.O_RDONLY)
    self._iterator = (SeqIO.parse(self.file, "fasta")
                      if self.isplit.offset == 0 else None)
def compute_vc(input_dir):
    fs = hdfs()
    data = []
    for x in fs.list_directory(input_dir):
        with fs.open_file(x['path']) as f:
            data.append(f.read())
    all_data = ''.join(data)
    vowels = re.findall('[AEIOUY]', all_data.upper())
    return Counter(vowels)
def __init__(self, context):
    super(FastaReader, self).__init__()
    self.logger = logging.getLogger(self.__class__.__name__)
    self.isplit = InputSplit(context.getInputSplit())
    self.host, self.port, self.fpath = split_hdfs_path(
        self.isplit.filename)
    self.fs = hdfs(self.host, self.port)
    self.file = self.fs.open_file(self.fpath, os.O_RDONLY)
    self._iterator = (SeqIO.parse(self.file, "fasta")
                      if self.isplit.offset == 0 else None)
def runTest(self):
    current_user = getpass.getuser()
    cwd = os.getcwd()
    os.chdir(tempfile.gettempdir())
    for user in None, current_user, "nobody":
        expected_user = current_user
        fs = hdfs.hdfs("", 0, user=user)
        self.assertEqual(fs.user, expected_user)
        fs.close()
    os.chdir(cwd)
def clearResults(name=""):
    """
    Clears the target result parquet table, or all result parquet tables
    if no name is given.

    :param name: the target result parquet table name
    :return: None
    """
    p = path.join(RESULT_DIR, name)
    h = hdfs()
    if h.exists(p):
        h.delete(p)
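# Usage sketch for the clear* helpers above (RESULT_DIR, LOCATIE and TELLING
# are assumed module-level constants, as in the snippets; "counts" is a
# hypothetical table name):
clearLocatie()
clearTelling()
clearResults()          # clears everything under RESULT_DIR
clearResults("counts")  # clears just RESULT_DIR/counts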
def main(directory, topic, byline):
    # get an hdfs object
    myHdfs = hdfs.hdfs()
    myPath = myHdfs.walk(directory)

    # a global variable
    global producer

    # get a producer object
    producer = KafkaProducer(bootstrap_servers=["node4:6667"],
                             compression_type='gzip', acks=1, retries=2)

    for myfile in myPath:
        # skip directories (walk recurses into them)
        if myfile["kind"] == "directory":
            logger.debug("ignoring %s" % myfile)
            continue
        elif myfile["kind"] == "file":
            pass
        else:
            raise Exception("Unknown kind %s for %s"
                            % (myfile["kind"], myfile["name"]))

        # skip particular names
        if "_SUCCESS" in myfile["name"] or "_temporary" in myfile["name"]:
            logger.debug("ignoring %s" % myfile)
            continue

        # skip zero-size files
        if myfile["size"] == 0:
            logger.debug("ignoring %s" % myfile)
            continue

        logger.info("Working on %s" % myfile["name"])

        # call processChunk to submit whole chunks,
        # otherwise submit line by line
        if byline is False:
            processChunk(myfile, topic)
        else:
            processLine(myfile, topic)

        logger.info("Completed %s" % myfile["name"])

        # sleep some time
        time.sleep(1)

    producer.close()
def get_hosts(self):
    if hdfs.default_is_local():  # only run on HDFS
        return
    hdfs.dump(self.data, self.hdfs_paths[0], mode="wb")
    fs = hdfs.hdfs("default", 0)
    hs = fs.get_hosts(self.hdfs_paths[0], 0, 10)
    self.assertTrue(len(hs) > 0)
    self.assertRaises(ValueError, fs.get_hosts, self.hdfs_paths[0], -10, 10)
    self.assertRaises(ValueError, fs.get_hosts, self.hdfs_paths[0], 0, -10)
def main():
    fs = hdfs.hdfs()
    try:
        root = "%s/%s" % (fs.working_directory(), TEST_ROOT)
        if not isdir(fs, root):
            sys.exit("%r does not exist" % root)
        print "BS(MB)\tBYTES"
        for k, v in usage_by_bs(fs, root).iteritems():
            print "%.1f\t%d" % (k / float(MB), v)
    finally:
        fs.close()
def Get_stock_ticks(code, time_to_market):
    import tushare as ts
    import pandas as pd
    import logging
    import datetime as dt
    import os
    import socket
    import pydoop.hdfs as hdfs
    import shutil

    if time_to_market != 0:
        logger = logging.getLogger("D_stock")
        logger_handler = logging.FileHandler("/tmp/D_stock.log")
        logger_handler.setFormatter(
            logging.Formatter("%(asctime)s -- %(message)s"))
        logger_handler.setLevel(logging.DEBUG)
        logger.setLevel(logging.DEBUG)
        logger.addHandler(logger_handler)
        logger.info(">" * 15 + code + ">" * 15)
        all_days = pd.date_range(start=str(time_to_market),
                                 end=dt.date.today(), freq="B")
        all_days = [x.date() for x in all_days]
        # dir_name is loop-invariant; defining it up front keeps the HDFS
        # block below from seeing it unbound when no day produced data
        dir_name = "/tmp/ticks/" + str(code)
        for day in all_days[::-1]:
            logger.info("Saving " + code + "@" + str(day) + "...")
            while True:
                try:
                    df = ts.get_tick_data(code, date=day)
                except Exception as e:
                    print(e)
                    continue
                break
            if df.index.size > 3:
                if not os.path.exists(dir_name):
                    os.makedirs(dir_name)
                file_name = dir_name + "/" + str(day) + ".csv"
                df.to_csv(file_name)
        # write to HDFS
        if os.path.exists(dir_name):
            s = hdfs.hdfs(host="spark-1", port=9000)
            if not s.exists("ticks"):
                s.create_directory("ticks")
            hdfs.put(dir_name, "./ticks/")
            shutil.rmtree(dir_name)
        logger.info("<" * 15 + code + "<" * 15)
    return (socket.gethostname(), code)
def get_hosts(self):
    if hdfs.default_is_local():  # only run on HDFS
        return
    hdfs.dump(self.data, self.hdfs_paths[0])
    fs = hdfs.hdfs("default", 0)
    hs = fs.get_hosts(self.hdfs_paths[0], 0, 10)
    self.assertTrue(len(hs) > 0)
    self.assertRaises(
        ValueError, fs.get_hosts, self.hdfs_paths[0], -10, 10
    )
    self.assertRaises(ValueError, fs.get_hosts, self.hdfs_paths[0], 0, -10)
def run(self):
    if self.options is None:
        raise RuntimeError("You must call parse_cmd_line before run")
    if self.logger.isEnabledFor(logging.DEBUG):
        self.logger.debug("Running Seqal")
        self.logger.debug(
            "Properties:\n%s", "\n".join(
                sorted(["%s = %s" % (str(k), str(v))
                        for k, v in self.properties.iteritems()])))
    self.logger.info("Input: %s; Output: %s; reference: %s",
                     self.options.input, self.options.output,
                     self.options.reference)
    try:
        self.hdfs = phdfs.hdfs('default', 0)
        self.__validate()
        self.remote_bin_name = tempfile.mktemp(
            prefix='seqal_bin.', suffix=str(random.random()), dir='')
        try:
            with self.hdfs.open_file(self.remote_bin_name, 'w') as script:
                self.__write_pipes_script(script)
            full_name = self.hdfs.get_path_info(
                self.remote_bin_name)['name']
            return hadut.run_pipes(
                full_name, self.options.input, self.options.output,
                properties=self.properties, args_list=self.left_over_args)
        finally:
            try:
                # delete the temporary pipes script from HDFS
                self.hdfs.delete(self.remote_bin_name)
                self.logger.debug("pipes script %s deleted",
                                  self.remote_bin_name)
            except Exception:
                # don't re-raise the exception: we're on our way out
                self.logger.error(
                    "Error deleting the temporary pipes script %s from HDFS",
                    self.remote_bin_name)
    finally:
        if self.hdfs:
            tmp = self.hdfs
            self.hdfs = None
            tmp.close()
            self.logger.debug("HDFS closed")
def main(args):
    host, port, out_dir = hdfs.path.split(args.out_dir)
    fs = hdfs.hdfs(host, port)
    fs.create_directory(out_dir)
    join = os.path.join
    for dt, path in get_images(args.in_dir):
        out_path = join(out_dir, f"{dt.strftime(OUT_FMT)}.png")
        if not args.overwrite and fs.exists(out_path):
            continue
        with io.open(path, "rb") as fi:
            with fs.open_file(out_path, "wb") as fo:
                fo.write(fi.read())
def copy(self):
    local_fs = hdfs.hdfs('', 0)
    local_wd = make_wd(local_fs)
    from_path = os.path.join(local_wd, uuid.uuid4().hex)
    content = uuid.uuid4().hex
    with open(from_path, "w") as f:
        f.write(content)
    to_path = self._make_random_file()
    local_fs.copy(from_path, self.fs, to_path)
    local_fs.close()
    with self.fs.open_file(to_path) as f:
        self.assertEqual(f.read(), content)
    shutil.rmtree(local_wd)
def main(directory, topic, byline):
    # get an hdfs object
    myHdfs = hdfs.hdfs()
    myPath = myHdfs.walk(directory)

    # a global variable
    global producer

    # get a producer object
    producer = KafkaProducer(bootstrap_servers=["node4:6667"],
                             compression_type='gzip', acks=1, retries=2)

    for myfile in myPath:
        # skip directories (walk recurses into them)
        if myfile["kind"] == "directory":
            logger.debug("ignoring %s" % myfile)
            continue
        elif myfile["kind"] == "file":
            pass
        else:
            raise Exception("Unknown kind %s for %s"
                            % (myfile["kind"], myfile["name"]))

        # skip particular names
        if "_SUCCESS" in myfile["name"] or "_temporary" in myfile["name"]:
            logger.debug("ignoring %s" % myfile)
            continue

        # skip zero-size files
        if myfile["size"] == 0:
            logger.debug("ignoring %s" % myfile)
            continue

        logger.info("Working on %s" % myfile["name"])

        # call processChunk to submit whole chunks,
        # otherwise submit line by line
        if byline is False:
            processChunk(myfile, topic)
        else:
            processLine(myfile, topic)

        logger.info("Completed %s" % myfile["name"])

        # sleep some time
        time.sleep(1)

    producer.close()
def Get_stock_ticks(code, time_to_market):
    import tushare as ts
    import pandas as pd
    import logging
    import datetime as dt
    import os
    import socket
    import pydoop.hdfs as hdfs
    import shutil

    if time_to_market != 0:
        logger = logging.getLogger("D_stock")
        logger_handler = logging.FileHandler("/tmp/D_stock.log")
        logger_handler.setFormatter(
            logging.Formatter("%(asctime)s -- %(message)s"))
        logger_handler.setLevel(logging.DEBUG)
        logger.setLevel(logging.DEBUG)
        logger.addHandler(logger_handler)
        logger.info(">" * 15 + code + ">" * 15)
        all_days = pd.date_range(start=str(time_to_market),
                                 end=dt.date.today(), freq="B")
        all_days = [x.date() for x in all_days]
        # dir_name is loop-invariant; defining it up front keeps the HDFS
        # block below from seeing it unbound when no day produced data
        dir_name = "/tmp/ticks/" + str(code)
        for day in all_days[::-1]:
            logger.info("Saving " + code + "@" + str(day) + "...")
            while True:
                try:
                    df = ts.get_tick_data(code, date=day)
                except Exception as e:
                    print(e)
                    continue
                break
            if df.index.size > 3:
                if not os.path.exists(dir_name):
                    os.makedirs(dir_name)
                file_name = dir_name + "/" + str(day) + ".csv"
                df.to_csv(file_name)
        # write to HDFS
        if os.path.exists(dir_name):
            s = hdfs.hdfs(host="spark-1", port=9000)
            if not s.exists("ticks"):
                s.create_directory("ticks")
            hdfs.put(dir_name, "./ticks/")
            shutil.rmtree(dir_name)
        logger.info("<" * 15 + code + "<" * 15)
    return (socket.gethostname(), code)
def main(argv=sys.argv[1:]):
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--conf-dir", metavar="HADOOP_CONF_DIR")
    args = parser.parse_args(argv)
    if args.conf_dir:
        os.environ["HADOOP_CONF_DIR"] = os.path.abspath(args.conf_dir)
        hdfs.reset()
    fs = hdfs.hdfs()
    print "--- OPEN ---"
    dump_status(fs)
    print "cwd:", fs.working_directory()
    print
    fs.close()
    print "--- CLOSED ---"
    dump_status(fs)
def setUp(self):
    wd = tempfile.mkdtemp()
    wd_bn = os.path.basename(wd)
    self.local_wd = "file:%s" % wd
    fs = hdfs.hdfs("default", 0)
    fs.create_directory(wd_bn)
    self.hdfs_wd = fs.get_path_info(wd_bn)["name"]
    fs.close()
    basenames = ["test_path_%d" % i for i in xrange(2)]
    self.local_paths = ["%s/%s" % (self.local_wd, bn) for bn in basenames]
    self.hdfs_paths = ["%s/%s" % (self.hdfs_wd, bn) for bn in basenames]
    self.data = make_random_data(4 * BUFSIZE + BUFSIZE / 2)
    for path in self.local_paths:
        self.assertTrue(path.startswith("file:"))
    for path in self.hdfs_paths:
        if not hdfs.default_is_local():
            self.assertTrue(path.startswith("hdfs:"))
def stat(self):
    if hdfs.default_is_local():
        return
    bn = '%s%s' % (make_random_str(), UNI_CHR)
    fn = '/user/%s/%s' % (DEFAULT_USER, bn)
    fs = hdfs.hdfs("default", 0)
    p = "hdfs://%s:%s%s" % (fs.host, fs.port, fn)
    with fs.open_file(fn, 'w') as fo:
        fo.write(make_random_str())
    info = fs.get_path_info(fn)
    fs.close()
    s = hdfs.path.stat(p)
    for n1, n2 in self.NMAP.iteritems():
        attr = getattr(s, n1, None)
        self.assertFalse(attr is None)
        self.assertEqual(attr, info[n2])
    self.__check_extra_args(s, info)
    self.__check_wrapper_funcs(p)
    hdfs.rmr(p)
def _clean_up_bcl_output(output_dir):
    """
    Delete prq files with no data.
    """
    host, port, _ = phdfs.path.split(output_dir)
    fs = phdfs.hdfs(host, port)
    count = 0
    for item in fs.walk(output_dir):
        # gzip files this small are effectively empty
        if item['kind'] == 'file' and item['name'].endswith('.gz') \
                and item['size'] < 30:
            if not item['name'].startswith('hdfs://'):
                raise RuntimeError(
                    "Insanity! Trying to delete %s!" % item['name'])
            fs.delete(item['name'], recursive=False)
            count += 1
    logger.info("Removed %d empty files from bcl output", count)
    undet_path = os.path.join(output_dir, 'Undetermined')
    if phdfs.path.exists(undet_path):
        logger.info("Removing reads from Undetermined dataset %s",
                    undet_path)
        fs.delete(undet_path)
def main(argv):
    try:
        depth = int(argv[1])
        span = int(argv[2])
    except IndexError:
        print "Usage: python %s DEPTH SPAN" % argv[0]
        sys.exit(2)
    fs = hdfs.hdfs()
    try:
        root = "%s/%s" % (fs.working_directory(), TEST_ROOT)
        try:
            fs.delete(root)
        except IOError:
            pass
        fs.create_directory(root)
        treegen(fs, root, depth, span)
    finally:
        fs.close()
def rename_compressed_files(self, file_table):
    # find the extension
    output_files = hdfs.ls(self.output_path)
    if len(output_files) == 0:
        return
    compressor_extension = self.get_compressor_extension(output_files)
    self.log.debug("compressor extension is %s", compressor_extension)
    hdfs_host, hdfs_port, _ = hdfs.path.split(output_files[0])
    is_local_fs = hdfs_host == ''
    output_hdfs = hdfs.hdfs(hdfs_host, hdfs_port)
    file_table.seek(0)
    for mapid, line in enumerate(file_table.xreadlines()):
        _, _, relative_output_name = line.rstrip('\n').split('\t')
        # We expect the map task ids to be assigned in the same order as
        # the input file list, so we can match each input file to an
        # output file by its position in the input file list.
        hadoop_output = os.path.join(
            self.output_path, "part-%05d" % mapid) + compressor_extension
        desired_file_name = os.path.join(
            self.output_path, relative_output_name) + compressor_extension
        if hadoop_output != desired_file_name:
            self.log.debug("renaming %s to %s",
                           hadoop_output, desired_file_name)
            if is_local_fs:
                # Though we could transparently use hdfs.move for both the
                # local fs and hdfs, native methods for the local fs should
                # be faster.  os.renames automatically creates any parent
                # directories needed by the destination.
                os.renames(urlparse(hadoop_output).path,
                           urlparse(desired_file_name).path)
            else:
                # create the output subdirectory, if necessary
                dirname = os.path.dirname(relative_output_name)
                if dirname:
                    output_hdfs.create_directory(
                        os.path.join(self.output_path, dirname))
                if output_hdfs.exists(desired_file_name):
                    raise RuntimeError(
                        "Can't overwrite file in output directory: %s"
                        % desired_file_name)
                output_hdfs.move(hadoop_output, output_hdfs,
                                 desired_file_name)
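# Worked example for the renaming above, with hypothetical values: given
# output_path "out", compressor extension ".gz", and a file_table whose
# third column for map task 0 is "2020/01.txt":
#
#   hadoop_output     = "out/part-00000.gz"
#   desired_file_name = "out/2020/01.txt.gz"
#
# so "out/2020" is created if needed and part-00000.gz is moved onto the
# desired name.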
def setUp(self):
    self.fs = hdfs()
    self.wd = utils.make_wd(self.fs)
def main(argv):
    parser = make_parser()
    opt, args = parser.parse_args()
    try:
        input_fasta = args[0]
        db_archive = args[1]
    except IndexError:
        parser.print_help()
        sys.exit(2)
    STR_GENERATOR.prefix = os.path.basename(input_fasta)
    logger = logging.getLogger()
    for h in logger.handlers:
        logger.removeHandler(h)
    opt.log_level_str = opt.log_level
    opt.log_level = getattr(logging, opt.log_level)
    kwargs = {'format': LOG_FORMAT, 'datefmt': LOG_DATEFMT,
              'level': opt.log_level}
    if opt.log_file:
        kwargs['filename'] = opt.log_file
    logging.basicConfig(**kwargs)
    logger.debug("cli args: %r" % (args,))
    logger.debug("cli opts: %s" % opt)
    if opt.mr_dump_file:
        opt.mr_dump_file = open(opt.mr_dump_file, "w")
    else:
        opt.mr_dump_file = sys.stderr
    if not opt.blast_db:
        opt.blast_db = os.path.basename(db_archive).split(".", 1)[0]
        logger.info("--blast-db not provided: setting to %r" % opt.blast_db)
    os.environ["HADOOP_HOME"] = opt.hadoop_home
    if not opt.hadoop:
        opt.hadoop = os.path.join(opt.hadoop_home, "bin/hadoop")
    if not opt.hadoop_conf_dir:
        opt.hadoop_conf_dir = os.path.join(opt.hadoop_home, "conf")
    os.environ["HADOOP_CONF_DIR"] = opt.hadoop_conf_dir
    hdfs.reset()
    fs = hdfs.hdfs()
    logger.debug("hdfs params: host=%s, port=%d" % (fs.host, fs.port))
    lfs = hdfs.hdfs("", 0)
    runner = Runner(fs, lfs, logger)
    try:
        db_archive_hdfs = runner.upload_archive(db_archive)
        blast_input_hdfs = runner.run_f2t(input_fasta, opt)
        blast_output_hdfs = runner.run_blast(blast_input_hdfs,
                                             db_archive_hdfs, opt)
        runner.collect_output(blast_output_hdfs, opt)
        logger.info("all done")
    finally:
        lfs.close()
        fs.close()
        if opt.mr_dump_file is not sys.stderr:
            opt.mr_dump_file.close()
def setUp(self):
    self.fs = hdfs.hdfs(self.hdfs_host, self.hdfs_port)
    self.wd = utils.make_wd(self.fs)