def get_fs(self):
    return hdfs3.HDFileSystem(
        host=self.host,
        port=self.port,
        pars={
            'input.localread.default.buffersize': '1',
            'input.read.default.verify': '0',
        },
    )
def store_output_model(self):
    """Store the TensorFlow model saved under the local model path to the output file path."""
    filenames = glob.glob(self.local_model_path + '/*')
    path_list = self.output_file_path.split(os.path.sep)
    if path_list[0].lower() == 'hdfs:':
        master, port = path_list[2].split(':')
        hdfs = hdfs3.HDFileSystem(master, port=int(port), user='******')
        output_path = '/' + os.path.join(*path_list[3:])
        print('local_model_path: {a}'.format(a=self.local_model_path))
        print('output_file_path: {a}'.format(a=self.output_file_path))
        if hdfs.exists(output_path):
            hdfs.rm(output_path)
        # Recreate the output directory once, then copy files into it.
        hdfs.mkdir(output_path)
        for file in filenames:
            if os.path.isdir(file):
                path, filename = os.path.split(file)
                hdfs.mkdir(output_path + '/' + filename)
                second_level_files = glob.glob(file + '/*')
                for second_level_file in second_level_files:
                    path, second_level_filename = os.path.split(second_level_file)
                    hdfs.put(second_level_file,
                             output_path + '/' + filename + '/' + second_level_filename,
                             block_size=1048576)
            else:
                path, filename = os.path.split(file)
                hdfs.put(file, output_path + '/' + filename, block_size=1048576)
    else:
        print("local_model_path: ", self.local_model_path)
        print("output_file_path: ", self.output_file_path)
        shutil.copytree(self.local_model_path, self.output_file_path)
def configure(self, location, host='localhost', port=9000, user=None,
              ticket_cache=None, token=None, pars=None, connect=True,
              **kwargs):
    """Configure the store backend."""
    self.storage = hdfs3.HDFileSystem(host=host, port=port, user=user,
                                      ticket_cache=ticket_cache,
                                      token=token, pars=pars,
                                      connect=connect)
    if location.startswith('/'):
        location = location[1:]
    self.cachedir = os.path.join(location, 'joblib')
    self.storage.mkdir(self.cachedir)

    # Attach the required methods using the monkey-patching trick.
    self.open_object = self.storage.open
    self.object_exists = self.storage.exists
    self.mv = self.storage.mv

    # Computation results can be stored compressed for faster I/O.
    self.compress = kwargs.get('compress', False)

    # Memory-map mode is not supported.
    self.mmap_mode = None
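# A minimal usage sketch for the configure() method above, assuming the
# enclosing class is a joblib store-backend subclass named HDFSStoreBackend
# (the class name is hypothetical; only configure() appears in the source).
backend = HDFSStoreBackend()
backend.configure('user/cache', host='namenode', port=9000, compress=True)

# The monkey-patched helpers can then be used directly:
with backend.open_object(backend.cachedir + '/hello.bin', 'wb') as f:
    f.write(b'cached bytes')
assert backend.object_exists(backend.cachedir + '/hello.bin')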
def get_hdfs_connect_handle():
    fs = hdfs3.HDFileSystem(host=dare_settings.DARE_LOCATION,
                            port=dare_settings.DARE_PORT,
                            user=dare_settings.DARE_USER)
    # If the directory already exists in HDFS, this command does not raise an error.
    fs.mkdir(dare_settings.DARE_ROOT_PATHNAME)
    return fs
def setup_cluster(config):
    if 'scheduler.ip' not in config:
        scheduler_ip = socket.gethostbyname(socket.gethostname())
    else:
        scheduler_ip = config['scheduler.ip']

    cluster = LocalCluster(n_workers=0,
                           ip=scheduler_ip,
                           port=config['scheduler.port'],
                           diagnostics_port=config['scheduler.bokeh_port'])

    if hdfs3 is not None:
        hdfs = hdfs3.HDFileSystem(host=config.get('hdfs.host'),
                                  port=config.get('hdfs.port'))
    else:
        hdfs = None

    knit = Knit(hdfs=hdfs,
                hdfs_home=config.get('hdfs.home'),
                rm=config.get('yarn.host'),
                rm_port=config.get('yarn.port'))

    command = ('$PYTHON_BIN $CONDA_PREFIX/bin/dask-worker '
               '--nprocs={nprocs:d} '
               '--nthreads={nthreads:d} '
               '--memory-limit={memory_limit:d} '
               '{scheduler_address} '
               '> /tmp/worker-log.out '
               '2> /tmp/worker-log.err').format(
                   nprocs=config['worker.processes'],
                   nthreads=config['worker.threads_per_process'],
                   memory_limit=int(config['worker.memory'] * 1e6),
                   scheduler_address=cluster.scheduler.address)

    app_id = knit.start(command,
                        env=config['cluster.env'],
                        num_containers=config['cluster.count'],
                        virtual_cores=config['worker.cpus'],
                        memory=config['worker.memory'],
                        queue=config['yarn.queue'],
                        app_name='dask',
                        checks=False)

    # Add a few missing fields to config before writing to disk.
    config2 = config.copy()

    # The IP is optional; the port may be chosen dynamically.
    config2['scheduler.ip'] = cluster.scheduler.ip
    config2['scheduler.port'] = cluster.scheduler.port

    # Fill in optional parameters with auto-detected versions.
    config2['yarn.host'] = knit.conf['rm']
    config2['yarn.port'] = knit.conf['rm_port']
    config2['hdfs.home'] = knit.hdfs_home

    # Add in runtime information like app_id and daemon pid.
    config2['application.id'] = app_id
    config2['application.pid'] = os.getpid()

    return cluster, knit, config2
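# A hypothetical config dict for setup_cluster above, illustrating the keys
# the function reads ('scheduler.ip' is optional); all values are
# illustrative only.
example_config = {
    'scheduler.port': 8786,
    'scheduler.bokeh_port': 8787,
    'hdfs.host': 'namenode',
    'hdfs.port': 8020,
    'hdfs.home': '/user/dask',
    'yarn.host': 'resourcemanager',
    'yarn.port': 8032,
    'yarn.queue': 'default',
    'cluster.env': 'dask-env.zip',
    'cluster.count': 4,
    'worker.processes': 1,
    'worker.threads_per_process': 4,
    'worker.memory': 4096,
    'worker.cpus': 2,
}
# cluster, knit, config2 = setup_cluster(example_config)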
def __init__(self, namenode='localhost', namenode_port=8020, dest_dtype=None,
             replication=3):
    self.namenode = namenode
    self.namenode_port = namenode_port
    self.hdfs = hdfs3.HDFileSystem(namenode, port=namenode_port)
    self.dest_dtype = dest_dtype
    self.replication = replication
def get_fs():
    import hdfs3
    return hdfs3.HDFileSystem('localhost', port=8020, pars={
        'input.localread.default.buffersize': str(1),
        'dfs.client.read.shortcircuit': '1',
        'input.read.default.verify': '0',
    })
def get_conn(self) -> hdfs3.core.HDFileSystem:
    effective_user = self.effective_user
    connection = self.get_connections(self.hdfs_conn_id)[0]
    if not self.effective_user:
        effective_user = connection.login
    return hdfs3.HDFileSystem(host=connection.host,
                              port=connection.port,
                              user=effective_user)
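# A minimal usage sketch, assuming this method lives on an Airflow-style hook
# class constructed with an hdfs_conn_id; HDFSHook and 'hdfs_default' are
# hypothetical names, not taken from the source.
hook = HDFSHook(hdfs_conn_id='hdfs_default')
fs = hook.get_conn()
print(fs.ls('/user'))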
def _read_data_file_from_hdfs(self, input_file_path, max_row=None):
    # If max_row is None, read the whole file.
    num_rows_to_read = max_row
    path_list = input_file_path.split(os.path.sep)
    master, port = path_list[2].split(':')
    hdfs = hdfs3.HDFileSystem(master, port=int(port), user=path_list[4])
    input_file_path = '/' + os.path.join(*path_list[3:])
    with hdfs.open(input_file_path) as f:
        data = pd.read_csv(f, nrows=num_rows_to_read, header=None)
    num_rows = data.shape[0]
    if max_row is not None and num_rows >= max_row:
        data = data.iloc[:max_row]
    return data
def get_fs(self):
    # TODO: maybe this needs to be a context manager, too, so we can do:
    # with ds.get_fs() as fs:
    #     with fs.open("...") as f:
    #         f.read()
    return hdfs3.HDFileSystem(host=self.host, port=self.port, pars={
        'input.localread.default.buffersize': '1',
        'input.read.default.verify': '0',
    })
def get_fs(self):
    # TODO: maybe this needs to be a context manager, too, so we can do:
    # with reader.get_fs() as fs:
    #     with fs.open("...") as f:
    #         f.read()
    return hdfs3.HDFileSystem(
        host=self._host,
        port=self._port,
        pars={
            'input.localread.default.buffersize': '1',
            'input.read.default.verify': '0',
            'dfs.domain.socket.path': '/run/user/1000/hdfs-short-circuit.socket',
        },
    )
def clear():
    c = CondaCreator()
    try:
        yield
    finally:
        shutil.rmtree(c.conda_envs)
        try:
            k = Knit()
            import hdfs3
            hdfs = hdfs3.HDFileSystem()
            hdfs.rm(k.hdfs_home, recursive=True)
        except Exception:
            # Best-effort cleanup: ignore failures removing the HDFS home.
            pass
def hdfs(request):
    if request.param == 'hdfs3':
        hdfs = hdfs3.HDFileSystem(host='localhost', port=8020)
    else:
        hdfs = pyarrow.hdfs.connect(host='localhost', port=8020)

    if hdfs.exists(basedir):
        hdfs.rm(basedir, recursive=True)
    hdfs.mkdir(basedir)

    with dask.config.set(hdfs_driver=request.param):
        yield hdfs

    if hdfs.exists(basedir):
        hdfs.rm(basedir, recursive=True)
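# A minimal sketch of a test using the parametrized fixture above; the
# fixture itself would typically be declared with
# @pytest.fixture(params=['hdfs3', 'pyarrow']) in the source conftest, and
# test_roundtrip is a hypothetical test name.
def test_roundtrip(hdfs):
    path = basedir + '/hello.txt'
    with hdfs.open(path, 'wb') as f:
        f.write(b'hello')
    with hdfs.open(path, 'rb') as f:
        assert f.read() == b'hello'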
def __init__(self, namenode, namenode_port, replication):
    """
    Move data from a local data source to an HDFS filesystem.

    Parameters
    ----------
    namenode : str
        hostname of the HDFS namenode
    namenode_port : int
        port of the HDFS namenode (default 8020)
    replication : int
        number of replicas (default 3)
    """
    self.namenode = namenode
    self.namenode_port = namenode_port
    self.replication = replication
    self.hdfs = hdfs3.HDFileSystem(namenode, port=namenode_port)
def main():
    module = AnsibleModule(
        argument_spec=dict(
            namenode_host=dict(required=True, type='str'),
            namenode_port=dict(required=False, default=8020, type='int'),
            effective_user=dict(required=False, default=None, type='str'),
            state=dict(choices=['file', 'directory', 'touchz', 'absent'],
                       default=None),
            path=dict(aliases=['dest', 'name'], required=True, type='path'),
            mode=dict(required=False, default=None, type='raw'),
            owner=dict(required=False, default=None, type='str'),
            group=dict(required=False, default=None, type='str'),
            # Internal use only, for recursive ops
            original_basename=dict(required=False),
            recurse=dict(default=False, type='bool'),
            # Internal use only, for internal checks in the action plugins
            diff_peek=dict(default=None),
            # Internal use only, for template and copy
            validate=dict(required=False, default=None),
            src=dict(required=False, default=None, type='path'),
        ),
        supports_check_mode=True)

    # Verify that the HDFS client library is available.
    if not HAS_HDFS3:
        module.fail_json(msg="Failed to import required python module: hdfs3",
                         details=str(HAS_HDFS3_ERROR))

    # Initialise the HDFS client.
    params = module.params
    try:
        hdfs_client = hdfs3.HDFileSystem(host=params['namenode_host'],
                                         port=params['namenode_port'],
                                         user=params['effective_user'])
        run(module, hdfs_client)
        hdfs_client.disconnect()
    except ConnectionError:
        ex = get_exception()
        module.fail_json(
            msg='Unable to init HDFS client for %s:%s: %s'
            % (params['namenode_host'], params['namenode_port'], str(ex)))
def read_data_file_from_hdfs(input_file_path, max_row=None):
    # e.g. input_file_path = 'hdfs://csle1:9000/user/leeyh_etri_re_kr/dataset/input/trainset.csv'
    # If max_row is None, read the whole file.
    num_rows_to_read = max_row
    path_list = input_file_path.split(os.path.sep)
    print(path_list[0], path_list[2], path_list[4])
    master, port = path_list[2].split(':')
    print(master, port)
    hdfs = hdfs3.HDFileSystem(master, port=int(port), user=path_list[4])
    input_file_path = '/' + os.path.join(*path_list[3:])
    with hdfs.open(input_file_path) as f:
        data = pd.read_csv(f, nrows=num_rows_to_read, header=None)
    num_rows = data.shape[0]
    if max_row is not None and num_rows >= max_row:
        data = data.iloc[:max_row]
    return data
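# Example invocation of the helper above; the URL is the sample path given
# in the function's own comment.
df = read_data_file_from_hdfs(
    'hdfs://csle1:9000/user/leeyh_etri_re_kr/dataset/input/trainset.csv',
    max_row=1000)
print(df.shape)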
def hdfs(self):
    """
    An instance of HDFileSystem

    Useful for checking on the contents of the staging directory. It will be
    generated automatically from this instance's configuration, but
    ``self._hdfs`` can also be set directly if necessary.

    Note: if the namenode/port is not defined in the conf, this will not
    attempt a connection, since it can take a while trying to connect to
    localhost:8020.
    """
    if self._hdfs is None:
        try:
            import hdfs3
            par2 = self.conf.copy()
            par2['host'] = par2.pop('nn')
            par2['port'] = par2.pop('nn_port')
            del par2['replication_factor']
            del par2['rm_port']
            del par2['rm_port_https']
            self._hdfs = hdfs3.HDFileSystem(pars=par2)
        except Exception:
            self._hdfs = False
    return self._hdfs
# Documentation for accessing HDFS from Python 3:
# https://readthedocs.org/projects/hdfs3/downloads/pdf/latest/
import hdfs3
from collections import defaultdict, Counter

# Connect to HDFS.
# Check the configuration in docker-compose.yml: this is the RPC port of the
# Hadoop namenode, 8020 by default in version 2.7, but it may be different.
hdfs = hdfs3.HDFileSystem('localhost', port=8020)

"""
HDFileSystem([host, port, connect, ...])           Connection to an HDFS namenode
HDFileSystem.cat(path)                             Return contents of file
HDFileSystem.chmod(path, mode)                     Change access control of given path
HDFileSystem.chown(path, owner, group)             Change owner/group
HDFileSystem.df()                                  Used/free disc space on the HDFS system
HDFileSystem.du(path[, total, deep])               Returns file sizes on a path
HDFileSystem.exists(path)                          Is there an entry at path?
HDFileSystem.get(hdfs_path, local_path[, ...])     Copy HDFS file to local
HDFileSystem.getmerge(path, filename[, ...])       Concat all files in path (a directory) to local output file
HDFileSystem.get_block_locations(path[, ...])      Fetch physical locations of blocks
HDFileSystem.glob(path)                            Get list of paths matching glob-like pattern (i.e., with "*"s)
HDFileSystem.info(path)                            File information (as a dict)
HDFileSystem.ls(path[, detail])                    List files at path
HDFileSystem.mkdir(path)                           Make directory at path
HDFileSystem.mv(path1, path2)                      Move file at path1 to path2
HDFileSystem.open(path[, mode, replication, ...])  Open a file for reading or writing
HDFileSystem.put(filename, path[, chunk, ...])     Copy local file to path in HDFS
HDFileSystem.read_block(fn, offset, length)        Read a block of bytes from an HDFS file
"""
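# A short sketch exercising a few of the methods listed above against the
# connection created earlier; '/tmp/demo' is an illustrative path.
hdfs.mkdir('/tmp/demo')
with hdfs.open('/tmp/demo/example.txt', 'wb') as f:
    f.write(b'hello hdfs\n')
print(hdfs.ls('/tmp/demo'))
print(hdfs.cat('/tmp/demo/example.txt'))
hdfs.rm('/tmp/demo', recursive=True)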
import random
import time
import gc

import pyarrow as pa
import hdfs3
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

DATA_SIZE = 200 * (1 << 20)
data = b'a' * DATA_SIZE

hdfs = pa.HdfsClient('localhost', 20500, 'wesm')
hdfscpp = pa.HdfsClient('localhost', 20500, 'wesm', driver='libhdfs3')
hdfs3_fs = hdfs3.HDFileSystem('localhost', port=20500, user='******')

path = '/tmp/test-data-file-1'
hdfs.delete(path)
with hdfs.open(path, 'wb') as f:
    f.write(data)

def read_chunk(f, size):
    # Do a random seek, then read a chunk.
    f.seek(random.randint(0, size))
    return f.read(size)

def ensemble_average(runner, niter=10):
    start = time.clock()
    gc.disable()
    data_chunks = []
def __init__(self, **kwargs):
    self.fs = hdfs3.HDFileSystem(**kwargs)
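# Usage sketch for the thin wrapper above; HDFSWrapper is a hypothetical name
# for the enclosing class. All keyword arguments are forwarded unchanged to
# hdfs3.HDFileSystem.
wrapper = HDFSWrapper(host='localhost', port=8020)
print(wrapper.fs.ls('/'))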
def test_promotion(segment_manager_server):
    hdfs = hdfs3.HDFileSystem(settings['HDFS_HOST'], settings['HDFS_PORT'])
    hdfs.rm(settings['HDFS_PATH'])
    hdfs.mkdir(settings['HDFS_PATH'])

    result = segment_manager_server.get('/promote')
    assert result.status == '405 METHOD NOT ALLOWED'

    # provision a test segment for write
    result = segment_manager_server.post(
        '/provision', content_type='application/json',
        data=ujson.dumps({'segment': 'test_promotion'}))
    assert result.status_code == 200
    assert result.mimetype == 'application/json'
    result_bytes = b''.join(result.response)
    result_dict = ujson.loads(result_bytes)
    assert result_dict['write_url'].endswith(':6222/?segment=test_promotion')
    write_url = result_dict['write_url']

    # write something into the db
    sql = ('create table foo (bar varchar(100));\n'
           'insert into foo (bar) values ("testing segment promotion");\n')
    response = requests.post(write_url, sql)
    assert response.status_code == 200

    # shouldn't be anything in hdfs yet...
    expected_remote_path = os.path.join(settings['HDFS_PATH'], 'test_promot',
                                        'test_promotion.sqlite')
    with pytest.raises(FileNotFoundError):
        hdfs.ls(expected_remote_path, detail=True)

    # now write to the segment and promote it to HDFS
    before = time.time()
    time.sleep(1.5)
    result = segment_manager_server.post(
        '/promote', content_type='application/json',
        data=ujson.dumps({'segment': 'test_promotion'}))
    assert result.status_code == 200
    assert result.mimetype == 'application/json'
    result_bytes = b''.join(result.response)
    result_dict = ujson.loads(result_bytes)
    assert result_dict == {'remote_path': expected_remote_path}

    # make sure it doesn't think the segment is under promotion
    rethinker = doublethink.Rethinker(servers=settings['RETHINKDB_HOSTS'],
                                      db='trough_configuration')
    query = rethinker.table('lock').get('write:lock:test_promotion')
    result = query.run()
    assert not result.get('under_promotion')

    # let's see if it's in hdfs
    listing_after_promotion = hdfs.ls(expected_remote_path, detail=True)
    assert len(listing_after_promotion) == 1
    assert listing_after_promotion[0]['last_mod'] > before

    # grab the file from hdfs and check the content
    # n.b. the copy created by sqlitebck may have a different size, sha1 etc.
    # from the original
    size = None
    with tempfile.TemporaryDirectory() as tmpdir:
        local_copy = os.path.join(tmpdir, 'test_promotion.sqlite')
        hdfs.get(expected_remote_path, local_copy)
        conn = sqlite3.connect(local_copy)
        cur = conn.execute('select * from foo')
        assert cur.fetchall() == [('testing segment promotion',)]
        conn.close()
        size = os.path.getsize(local_copy)

    # test promotion when there is an assignment in rethinkdb
    rethinker.table('assignment').insert({
        'assigned_on': doublethink.utcnow(),
        'bytes': size,
        'hash_ring': 0,
        'id': 'localhost:test_promotion',
        'node': 'localhost',
        'remote_path': expected_remote_path,
        'segment': 'test_promotion',
    }).run()

    # promote it to HDFS
    before = time.time()
    time.sleep(1.5)
    result = segment_manager_server.post(
        '/promote', content_type='application/json',
        data=ujson.dumps({'segment': 'test_promotion'}))
    assert result.status_code == 200
    assert result.mimetype == 'application/json'
    result_bytes = b''.join(result.response)
    result_dict = ujson.loads(result_bytes)
    assert result_dict == {'remote_path': expected_remote_path}

    # make sure it doesn't think the segment is under promotion
    rethinker = doublethink.Rethinker(servers=settings['RETHINKDB_HOSTS'],
                                      db='trough_configuration')
    query = rethinker.table('lock').get('write:lock:test_promotion')
    result = query.run()
    assert not result.get('under_promotion')

    # let's see if it's in hdfs
    listing_after_promotion = hdfs.ls(expected_remote_path, detail=True)
    assert len(listing_after_promotion) == 1
    assert listing_after_promotion[0]['last_mod'] > before

    # pretend the segment is under promotion
    rethinker.table('lock')\
             .get('write:lock:test_promotion')\
             .update({'under_promotion': True}).run()
    assert rethinker.table('lock')\
                    .get('write:lock:test_promotion').run()\
                    .get('under_promotion')
    with pytest.raises(Exception):
        result = segment_manager_server.post(
            '/promote', content_type='application/json',
            data=ujson.dumps({'segment': 'test_promotion'}))
def ingest_avro(schema_path: str, avro_path: str, target_table: str,
                host: str, thrift_port: int, hdfs_port: int, username: str):
    """Ingest Avro data into Hive."""
    fs = hdfs3.HDFileSystem(host=host, port=hdfs_port, user=username)

    schema_basename = os.path.basename(schema_path)
    hdfs_schema_path = os.path.join(f'/user/{username}', schema_basename)
    full_hdfs_schema_path = f'hdfs://{host}:{hdfs_port}{hdfs_schema_path}'
    if fs.exists(hdfs_schema_path):
        fs.rm(hdfs_schema_path)
    fs.put(schema_path, hdfs_schema_path, replication=1)

    avro_basename = os.path.basename(avro_path)
    hdfs_avro_path = os.path.join(f'/user/{username}', avro_basename)
    if not fs.exists(hdfs_avro_path):
        fs.put(avro_path, hdfs_avro_path, replication=1)

    conn = hive.Connection(host=host, port=thrift_port, username=username,
                           configuration={
                               'hive.exec.dynamic.partition.mode': 'nonstrict'
                           })
    cursor = conn.cursor()

    input_fmt = 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
    output_fmt = 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
    row_format = 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
    temp_table_name = avro_basename.replace('.', '_').replace('-', '_')

    create_temp_table_stmt = f"""
        CREATE TABLE IF NOT EXISTS {temp_table_name}
        ROW FORMAT SERDE '{row_format}'
        STORED AS
        INPUTFORMAT '{input_fmt}'
        OUTPUTFORMAT '{output_fmt}'
        TBLPROPERTIES ('avro.schema.url'='{full_hdfs_schema_path}')
    """
    print(f'--- create_temp_table_stmt ---\n{create_temp_table_stmt}')
    cursor.execute(create_temp_table_stmt)

    select_temp_row_stmt = f"""
        SELECT * FROM {temp_table_name} LIMIT 1
    """
    print(f'--- select_temp_row_stmt ---\n{select_temp_row_stmt}')
    cursor.execute(select_temp_row_stmt)
    if cursor.fetchone() is None:
        load_data_stmt = f"""
            LOAD DATA INPATH '{hdfs_avro_path}' INTO TABLE {temp_table_name}
        """
        print(f'--- load_data_stmt ---\n{load_data_stmt}')
        cursor.execute(load_data_stmt)

    create_target_table_stmt = f"""
        CREATE EXTERNAL TABLE IF NOT EXISTS {target_table}
        PARTITIONED BY (ds STRING, h STRING, en STRING)
        ROW FORMAT SERDE '{row_format}'
        STORED AS
        INPUTFORMAT '{input_fmt}'
        OUTPUTFORMAT '{output_fmt}'
        TBLPROPERTIES ('avro.schema.url'='{full_hdfs_schema_path}')
    """
    print(f'--- create_target_table_stmt ---\n{create_target_table_stmt}')
    cursor.execute(create_target_table_stmt)

    insert_data_stmt = f"""
        INSERT INTO {target_table} PARTITION (ds, h, en)
        SELECT *, datestamp AS ds, substr(server_date, 12, 2) AS h,
               event_name AS en
        FROM {temp_table_name}
    """
    print(f'--- insert_data_stmt ---\n{insert_data_stmt}')
    cursor.execute(insert_data_stmt)

    drop_temp_table_stmt = f"""
        DROP TABLE {temp_table_name}
    """
    print(f'--- drop_temp_table_stmt ---\n{drop_temp_table_stmt}')
    cursor.execute(drop_temp_table_stmt)
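# Example invocation of ingest_avro above; host names, table name, and file
# paths are illustrative. 10000 is the default HiveServer2 thrift port and
# 8020 the default namenode RPC port.
ingest_avro(schema_path='events.avsc',
            avro_path='events-2020-01-01.avro',
            target_table='events',
            host='localhost',
            thrift_port=10000,
            hdfs_port=8020,
            username='hive')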
def test_delete_segment(segment_manager_server):
    hdfs = hdfs3.HDFileSystem(settings['HDFS_HOST'], settings['HDFS_PORT'])
    rethinker = doublethink.Rethinker(servers=settings['RETHINKDB_HOSTS'],
                                      db='trough_configuration')

    # initially, segment doesn't exist
    result = segment_manager_server.delete('/segment/test_delete_segment')
    assert result.status_code == 404

    # provision segment
    result = segment_manager_server.post(
        '/provision', content_type='application/json',
        data=ujson.dumps({'segment': 'test_delete_segment'}))
    assert result.status_code == 200
    assert result.mimetype == 'application/json'
    result_bytes = b''.join(result.response)
    result_dict = ujson.loads(result_bytes)
    assert result_dict['write_url'].endswith(
        ':6222/?segment=test_delete_segment')
    write_url = result_dict['write_url']

    # write something into the db
    sql = ('create table foo (bar varchar(100));\n'
           'insert into foo (bar) values ("testing segment deletion");\n')
    response = requests.post(write_url, sql)
    assert response.status_code == 200

    # check that local file exists
    local_path = os.path.join(settings['LOCAL_DATA'],
                              'test_delete_segment.sqlite')
    assert os.path.exists(local_path)

    # check that attempted delete while under write returns 400
    result = segment_manager_server.delete('/segment/test_delete_segment')
    assert result.status_code == 400

    # shouldn't be anything in hdfs yet
    expected_remote_path = os.path.join(settings['HDFS_PATH'],
                                        'test_delete_segm',
                                        'test_delete_segment.sqlite')
    with pytest.raises(FileNotFoundError):
        hdfs.ls(expected_remote_path, detail=True)

    # promote segment to hdfs
    result = segment_manager_server.post(
        '/promote', content_type='application/json',
        data=ujson.dumps({'segment': 'test_delete_segment'}))
    assert result.status_code == 200
    assert result.mimetype == 'application/json'
    result_bytes = b''.join(result.response)
    result_dict = ujson.loads(result_bytes)
    assert result_dict == {'remote_path': expected_remote_path}

    # let's see if it's in hdfs
    hdfs_ls = hdfs.ls(expected_remote_path, detail=True)
    assert len(hdfs_ls) == 1

    # add an assignment (so we can check it is deleted successfully)
    rethinker.table('assignment').insert({
        'assigned_on': doublethink.utcnow(),
        'bytes': os.path.getsize(local_path),
        'hash_ring': 0,
        'id': '%s:test_delete_segment' % socket.gethostname(),
        'node': socket.gethostname(),
        'remote_path': expected_remote_path,
        'segment': 'test_delete_segment',
    }).run()

    # check that service entries and the assignment exist
    assert rethinker.table('services')\
            .get('trough-read:%s:test_delete_segment' % socket.gethostname())\
            .run()
    assert rethinker.table('services')\
            .get('trough-write:%s:test_delete_segment' % socket.gethostname())\
            .run()
    assert rethinker.table('assignment')\
            .get('%s:test_delete_segment' % socket.gethostname()).run()

    # check that attempted delete while under write returns 400
    result = segment_manager_server.delete('/segment/test_delete_segment')
    assert result.status_code == 400

    # delete the write lock
    assert rethinker.table('lock')\
            .get('write:lock:test_delete_segment').delete().run() == {
                'deleted': 1, 'errors': 0, 'inserted': 0,
                'replaced': 0, 'skipped': 0, 'unchanged': 0,
            }

    # delete the segment
    result = segment_manager_server.delete('/segment/test_delete_segment')
    assert result.status_code == 204

    # check that service entries and assignment are gone
    assert not rethinker.table('services')\
            .get('trough-read:%s:test_delete_segment' % socket.gethostname())\
            .run()
    assert not rethinker.table('services')\
            .get('trough-write:%s:test_delete_segment' % socket.gethostname())\
            .run()
    assert not rethinker.table('assignment')\
            .get('%s:test_delete_segment' % socket.gethostname()).run()

    # check that local file is gone
    assert not os.path.exists(local_path)

    # check that file is gone from hdfs
    with pytest.raises(FileNotFoundError):
        hdfs_ls = hdfs.ls(expected_remote_path, detail=True)
def train_or_predict():
    # FLAGS.input = 'hdfs://csle1:9000/user/leeyh_etri_re_kr/dataset/input/trainset.csv'
    # FLAGS.output = 'hdfs://csle1:9000/user/leeyh_etri_re_kr/output/models/rnn/0020'
    # FLAGS.model = '/home/csle/testCodes/models/rnn/0020'
    # FLAGS.input = 'file:///home/csle/testCodes/input/trainset.csv'
    # FLAGS.output = 'file:///home/csle/testCodes/models/rnn/0020'
    # FLAGS.model = 'file:///home/csle/testCodes/models/rnn/0020'
    # FLAGS.input = '/home/csle/testCodes/input/trainset.csv'
    # FLAGS.output = '/home/csle/testCodes/models/rnn/0020'
    # FLAGS.model = '/home/csle/testCodes/models/rnn/0020'
    (root_path, sep, input_data_path) = FLAGS.input.rpartition('/')
    (checkpoint_dir, sep, model_path) = FLAGS.model.rpartition('/')
    (train_accuracy_dir, sep, train_accuracy_path) = FLAGS.output.rpartition('/')

    if FLAGS.isTrain:
        datasets, num_links = preproc.load_processed_data(
            FLAGS.isTrain, FLAGS.num_train, FLAGS.num_validation,
            FLAGS.num_test, FLAGS.input, FLAGS.num_steps, FLAGS.elapse_steps)
    else:
        datasets, num_links = preproc.load_processed_data(
            FLAGS.isTrain, FLAGS.num_predict, 0, 0, FLAGS.input,
            FLAGS.num_steps, FLAGS.elapse_steps)

    if FLAGS.num_links > num_links:
        FLAGS.num_links = num_links
    if FLAGS.num_outputs > FLAGS.num_links:
        FLAGS.num_outputs = FLAGS.num_links

    if 'sess' in globals():
        sess.close()
    tf.reset_default_graph()
    sess = tf.InteractiveSession()

    x = tf.placeholder(tf.float32,
                       [FLAGS.batch_size, FLAGS.num_steps, FLAGS.num_links],
                       name='input_placeholder')
    y = tf.placeholder(tf.float32,
                       [FLAGS.batch_size, FLAGS.num_outputs],
                       name='labels_placeholder')
    loss_weights = tf.placeholder(tf.float32, [FLAGS.batch_size])
    keep_prob = tf.placeholder(tf.float32)

    cell = tf.contrib.rnn.BasicLSTMCell(FLAGS.state_size, state_is_tuple=True,
                                        reuse=tf.get_variable_scope().reuse)
    init_state = cell.zero_state(FLAGS.batch_size, tf.float32)
    rnn_outputs, final_state = tf.nn.dynamic_rnn(cell, x,
                                                 initial_state=init_state)
    rnn_last_outputs = rnn_outputs[:, FLAGS.num_steps - 1, :]

    # Output layers
    with tf.variable_scope('fully_connected_0'):
        W = tf.get_variable('W', [FLAGS.state_size, FLAGS.state_size])
        b = tf.get_variable('b', [FLAGS.state_size],
                            initializer=tf.constant_initializer(0.0))
        fc_outputs_0 = tf.nn.elu(tf.matmul(rnn_last_outputs, W) + b)
        fc_dropout_0 = tf.nn.dropout(fc_outputs_0, keep_prob)

    with tf.variable_scope('fully_connected_1'):
        W = tf.get_variable('W', [FLAGS.state_size, FLAGS.state_size])
        b = tf.get_variable('b', [FLAGS.state_size],
                            initializer=tf.constant_initializer(0.0))
        fc_outputs_1 = tf.nn.elu(tf.matmul(fc_dropout_0, W) + b)
        fc_dropout_1 = tf.nn.dropout(fc_outputs_1, keep_prob)

    with tf.variable_scope('fully_connected_2'):
        W = tf.get_variable('W', [FLAGS.state_size, FLAGS.num_outputs])
        b = tf.get_variable('b', [FLAGS.num_outputs],
                            initializer=tf.constant_initializer(0.0))
        # final outputs, predictions
        outputs = tf.nn.elu(tf.matmul(fc_dropout_1, W) + b)
        unscaled_output = 100 * (outputs + 0.5)

    mse = tf.square(y - outputs)
    mse = tf.reduce_mean(mse, reduction_indices=[1])
    total_loss = tf.reduce_sum(mse * loss_weights)
    train_step = tf.train.AdamOptimizer(FLAGS.learning_rate).minimize(total_loss)

    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())

    if FLAGS.isTrain:
        print('Tensorflow train job started!')
        num_examples = FLAGS.num_train - FLAGS.num_steps - FLAGS.elapse_steps + 1
        pbar = PrgBar(FLAGS.num_epoch, num_examples)

        p = os.path.join(FLAGS.model, FLAGS.model_filename)
        local_model_path = p[:p.rindex(os.path.sep)]
        local_path_list = local_model_path.split(os.path.sep)
        if local_path_list[0].lower() == 'file:':
            local_model_path = '/' + os.path.join(*local_path_list[3:])
        os.makedirs(local_model_path, exist_ok=True)

        for idx in range(FLAGS.num_epoch):
            last_batch = False
            while last_batch is False:
                X, Y, last_batch, lw, num_samples = datasets.train.next_batch(
                    FLAGS.batch_size, FLAGS.num_links, FLAGS.num_outputs)
                loss_, last_outputs, est, _ = sess.run(
                    [total_loss, rnn_last_outputs, unscaled_output, train_step],
                    feed_dict={x: X, y: Y, loss_weights: lw,
                               keep_prob: FLAGS.dropout})
                pbar.log(num_samples, loss_)
                sys.stdout.flush()
            if (idx + 1) % FLAGS.checkpoint_steps == 0:
                saver.save(sess, p)
            print(pbar.losses)
        if (idx + 1) % FLAGS.checkpoint_steps != 0:
            saver.save(sess, p)

        model_path = FLAGS.model
        path_list = model_path.split(os.path.sep)
        if path_list[0].lower() == 'file:':
            model_path = '/' + os.path.join(*path_list[3:])
        filenames = glob.glob(model_path + '/*')  # e.g. /home/csle/testCodes/models/rnn/0019/*

        output_file_path = FLAGS.output
        path_list = output_file_path.split(os.path.sep)
        if path_list[0].lower() == 'hdfs:':
            # e.g. 'hdfs://csle1:9000/user/leeyh_etri_re_kr/output/models/rnn/0019'
            master, port = path_list[2].split(':')
            hdfs = hdfs3.HDFileSystem(master, port=int(port), user='******')
            output_path = '/' + os.path.join(*path_list[3:])
            if hdfs.exists(output_path):
                hdfs.rm(output_path)
            # Recreate the output directory once, then copy the model files.
            hdfs.mkdir(output_path)
            for file in filenames:
                path, filename = os.path.split(file)
                hdfs.put(file, output_path + '/' + filename,
                         block_size=1048576)
            print(hdfs.ls(output_path))

            p = os.path.join(train_accuracy_dir, train_accuracy_path)
            local_p = os.path.join(local_model_path, train_accuracy_path)
            accuracy_file = open(local_p, 'w')
            accuracy_file.write("%.7f" % pbar.getAverageLoss())
            accuracy_file.close()
            hdfs.put(local_p, output_path + '/' + train_accuracy_path,
                     block_size=1048576)
    else:
        # Here's where the variables W and b are restored. Note that the
        # graph is exactly as it was when the variables were saved in a
        # prior training run.
        print('Tensorflow prediction job started!')
        ckpt = tf.train.get_checkpoint_state(FLAGS.model)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
            print('Restored!', end="\r")
            predictions = evaluate_network(sess, datasets.train, x, y,
                                           total_loss, unscaled_output,
                                           loss_weights, keep_prob)
            p = os.path.join(FLAGS.output_predict_dir,
                             FLAGS.output_predict_path)
            os.makedirs(p[:p.rindex(os.path.sep)], exist_ok=True)
            predict_file = open(p, 'w')
            for predicts in predictions:
                predicts[predicts < 0] = 0.0
                strings = ["%.2f" % predict for predict in predicts]
                predict_file.write(",".join(strings))
                predict_file.write("\n")
                print(",".join(strings), end="\r")
                sys.stdout.flush()
            predict_file.close()
        else:
            print('No checkpoint found!')