def hdfs_open_file(path, auth):
    from hdfs3 import HDFileSystem
    hdfs = HDFileSystem(**auth)
    return hdfs.open(path, mode='rb')

def handle(q):
    hdfs = HDFileSystem(host='localhost', port=8020)
    q.put(hdfs._handle.contents.filesystem)

def test_different_handles():
    a = HDFileSystem(host=test_host, port=test_port)
    b = HDFileSystem(host=test_host, port=test_port)
    assert a._handle.contents.filesystem != b._handle.contents.filesystem

#!/home/cloudera/anaconda3/bin/python3.7
# import the hdfs3 library
import hdfs3
# import the defaultdict and Counter containers from the collections library
from collections import defaultdict, Counter
from hdfs3 import HDFileSystem

hdfs = HDFileSystem(host='localhost', port=8020)

# show the list of files available in the directory
filenames = hdfs.glob('/data_in')
print('List of files:')
print(filenames)


# count the words in a single file
def count_words(file):
    word_counts = defaultdict(int)
    for line in file:
        # decode each raw byte line as UTF-8
        line = line.decode('utf8').strip()
        for word in line.split():
            word_counts[word] += 1
    return word_counts


# count the words across all files available in the data_in directory
all_counts = Counter()
for fn in filenames:
    # read each file from HDFS and merge its counts into the running total
    with hdfs.open(fn) as f:
        all_counts.update(count_words(f))

import pyspark
from pyspark.sql import SparkSession, Row
sc = SparkSession.builder.master("local[*]").getOrCreate()
import pandas as pd
from hdfs3 import HDFileSystem

hdfs = HDFileSystem(host='horton')
hdfsS = HDFileSystem(host='stampy')

data = sc.read.csv('/user/rynguyen/state_info.csv')  # does not include the header
data.show()  # shows headers as _c0, _c1, ... because header is not set to True
data.printSchema()  # prints column settings
data.select("_c0").show()
data.registerTempTable("Data")
sc.sql("Select * from Data where _c0 = 'California'").show()

# Optimal way of reading data files into a data frame
df = sc.read.format('csv').options(header='true', inferSchema='true').load('/user/rynguyen/state_info.csv')
df.show()  # includes headers
df.createOrReplaceTempView("data")
# Returns results as a data frame
Houseseats2 = sc.sql("select * from data where HouseSeats = 2").show()  # data frame query
df.createGlobalTempView("data")  # temporary view shared among all sessions, kept alive until the Spark application terminates
frame1 = sc.sql("Select * from global_temp.data where HouseSeats = 2").show()

def __init__(self, host, port, user):
    self.hdfs = HDFileSystem(host=host, port=port, user=user)

#!/usr/bin/python
# imports
import io
import pandas as pd
import os
import sys
from hdfs3 import HDFileSystem

inputF = sys.argv[1]
outputF = sys.argv[2]

hdfs = HDFileSystem(host='bdhKC', port=9000)


# Required helper methods
def LimpiarBarrio(pBarrio):
    # strip the leading numeric district code from the neighbourhood name
    if pBarrio[0].isdigit():
        return pBarrio[4:]
    else:
        return pBarrio


# Take a look at what we have
with hdfs.open(inputF) as f:
    df = pd.read_csv(f, header=[0, 1], delimiter=';', nrows=22)

# Fix the column names
as_list = df.columns.tolist()
as_list[0] = ''

def hdfs_open_file(path, auth):
    hdfs = HDFileSystem(**auth)
    return hdfs.open(path, mode='rb')

def open_file_write(paths, hdfs=None, **kwargs):
    """ Open list of files using delayed """
    if hdfs is None:
        hdfs = HDFileSystem(kwargs.get('host'), kwargs.get('port'))
    out = [delayed(hdfs.open)(path, 'wb') for path in paths]
    return out

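# A short usage sketch for open_file_write above; the output paths and namenode
# address are hypothetical, and dask.compute is used to materialise the delayed
# hdfs.open calls into writable file handles.
from dask import compute
from hdfs3 import HDFileSystem

hdfs = HDFileSystem(host='localhost', port=8020)  # assumed namenode address
lazy_files = open_file_write(['/tmp/out-0.bin', '/tmp/out-1.bin'], hdfs=hdfs)

handles = compute(*lazy_files)  # each element opens one file for writing
for h in handles:
    h.write(b'example payload')
    h.close()
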
def setUpClass(cls):
    super().setUpClass()

    # kafka configuration
    kafka_test_utils_clz = cls.sc._jvm.java.lang.Thread.currentThread().getContextClassLoader() \
        .loadClass('org.apache.spark.streaming.kafka.KafkaTestUtils')
    cls._kafkaTestUtils = kafka_test_utils_clz.newInstance()
    cls._kafkaTestUtils.setup()
    ezk = cls._kafkaTestUtils.getClass().getDeclaredField('zookeeper')
    ezk.setAccessible(True)
    zk = ezk.get(cls._kafkaTestUtils).getClass().getDeclaredField('zookeeper')
    zk.setAccessible(True)
    zk.get(ezk.get(cls._kafkaTestUtils)).getServerCnxnFactory().setMaxClientCnxnsPerHost(100)

    # hbase configuration
    hbase_testing_utility_clz = cls.sc._jvm.java.lang.Thread.currentThread().getContextClassLoader() \
        .loadClass('org.apache.hadoop.hbase.HBaseTestingUtility')
    cls._hbaseTestingUtility = hbase_testing_utility_clz.newInstance()
    cls._hbaseTestingUtility.getConfiguration().setBoolean(
        'hbase.table.sanity.checks', False)  # for thrift
    cls._hbaseTestingUtility.getConfiguration().set(
        'hbase.zookeeper.property.clientPort',
        cls._kafkaTestUtils.zkAddress().split(':')[1])
    cls._hbaseTestingUtility.startMiniDFSCluster(1)
    cls._hbaseTestingUtility.startMiniHBaseCluster(1, 1)
    # cls._hbaseTestingUtility.startMiniCluster()

    # thrift server configuration
    thrift_server_clz = cls.sc._jvm.java.lang.Thread.currentThread().getContextClassLoader() \
        .loadClass('org.apache.hadoop.hbase.thrift.ThriftServer')
    cArgs = cls.sc._gateway.new_array(cls.sc._jvm.java.lang.Class, 1)
    cArgs[0] = cls._hbaseTestingUtility.getConfiguration().getClass()
    iArgs = cls.sc._gateway.new_array(cls.sc._jvm.java.lang.Object, 1)
    iArgs[0] = cls._hbaseTestingUtility.getConfiguration()
    cls._thriftServer = thrift_server_clz \
        .getDeclaredConstructor(cArgs) \
        .newInstance(iArgs)
    tArgs = cls.sc._gateway.new_array(cls.sc._jvm.java.lang.String, 5)
    port = cls._hbaseTestingUtility.randomFreePort()
    cls.thrift_port = port
    tArgs[0] = "-port"
    tArgs[1] = str(port)
    tArgs[2] = "-infoport"
    info_port = cls._hbaseTestingUtility.randomFreePort()
    tArgs[3] = str(info_port)
    tArgs[4] = "start"
    mArgs = cls.sc._gateway.new_array(cls.sc._jvm.java.lang.Class, 1)
    mArgs[0] = tArgs.getClass()
    method = thrift_server_clz.getDeclaredMethod('doMain', mArgs)
    method.setAccessible(True)
    args = cls.sc._gateway.new_array(cls.sc._jvm.java.lang.Object, 1)
    args[0] = tArgs
    cls.thrift_server_thread = threading.Thread(
        target=method.invoke, args=[cls._thriftServer, args])
    cls.thrift_server_thread.setDaemon(True)
    cls.thrift_server_thread.start()
    time.sleep(5)
    cls._hbaseTestingUtility.getMiniHBaseCluster().waitForActiveAndReadyMaster(60000)

    # test topics
    cls._kafkaTestUtils.createTopic(cls.test_topic_1)
    cls._kafkaTestUtils.createTopic(cls.test_topic_2)
    cls._kafkaTestUtils.createTopic('test_topic1')
    cls._kafkaTestUtils.createTopic('test_topic2')

    # HBase configuration
    connection = happybase.Connection(port=cls.thrift_port)
    connection.create_table(DummyProcessor.TABLE_NAME, {'dummy': dict()})
    connection.create_table('test_' + DummyProcessor.TABLE_NAME, {'dummy': dict()})
    # TODO Add table for coprocessor tests

    # HDFS configuration (used for co-processors)
    port = cls._hbaseTestingUtility.getDFSCluster().getNameNodePort()
    hdfs = HDFileSystem(host='localhost', port=port,
                        pars={'dfs.client.read.shortcircuit': 'false'})
    # TODO change path to real test co-prc.
    hdfs.put(
        '../../../../hbase-coprocessors/target/scala-2.10/hbase-coprocessors.jar',
        '/hbase-coprocessors.jar')

    # streaming engine test configuration
    cls.engine_config = {
        'socketServer.port': 4444,  # port of socket server
        'hadoop.dfs.url': 'hdfs://localhost:{}/'.format(
            cls._hbaseTestingUtility.getDFSCluster().getFileSystem().getUri().getPort()),
        'hadoop.conf': {
            'hbase.table.sanity.checks': False,
            'hbase.zookeeper.property.clientPort':
                cls._kafkaTestUtils.zkAddress().split(':')[1]
        },
        'zookeeper.zkQuorum': cls._kafkaTestUtils.zkAddress(),
        'kafka.groupId': 'test-streaming-consumer',
        'kafka.params': {
            'auto.offset.reset': 'largest'
        },
        'hbase.host': 'localhost',
        'hbase.port': cls.thrift_port,
        'processor.dir': 'processors',
        'processor.joinWindow': 20,
        'processor.poolExecutors': 1
    }

from enum import Enum
import pandas as pd
from io import StringIO
import time
from IPython.core.display import display, HTML
from IPython.core.magic import Magics, magics_class, cell_magic, line_magic, \
    needs_local_scope, register_cell_magic
from IPython.core.magic_arguments import argument, magic_arguments, \
    parse_argstring
from hdfs3 import HDFileSystem

horton = HDFileSystem(host='horton')
stampy = HDFileSystem(host='stampy')
tahoe = HDFileSystem(host='tahoe')


@magics_class
class mymagics(Magics):

    @register_cell_magic
    @magic_arguments()
    # @line_magic
    # @cell_magic
    @argument("-s", "--save", type=str,
              help="dataframe to be saved into csv format")
    def cdataframe(line, cell):
        sio = StringIO(cell)
        data = pd.read_csv(sio)
        return data

# !/usr/bin/env python
# -*- coding:utf-8 -*-
# https://www.cnblogs.com/shoufengwei/p/5949791.html
from hdfs3 import HDFileSystem

hdfs = HDFileSystem(host='XX.XX.XX.XX', port=9000)


def mkdir(remotepath):
    if not exists(remotepath):
        hdfs.mkdir(remotepath)


def get(remotepath, localpath):
    if exists(remotepath):
        hdfs.get(remotepath, localpath)


# def put(localfile, remotefile):
#     dir = getDirPath(remotefile)
#     mkdir(dir)
#     hdfs.put(localfile, remotefile)


def exists(remotepath):
    return hdfs.exists(remotepath)


def delete(remotepath):
    if exists(remotepath):
        hdfs.rm(remotepath)

#!/bin/env python
# -*- coding: utf-8 -*-
import os.path
from hdfs3 import HDFileSystem
import config

print('NameNode host:', config.NAMENODE_HOST)
print('NameNode port:', config.NAMENODE_PORT)

client = HDFileSystem(host=config.NAMENODE_HOST, port=config.NAMENODE_PORT)

remote_dir = os.path.dirname(config.RFILE_FMT)
if not client.exists(remote_dir):
    client.mkdir(remote_dir)

for day in range(0, config.DAYS):
    src = "".join([config.LFILE_FMT, str(day)])
    dst = "".join([config.RFILE_FMT, str(day)])
    if not os.path.exists(src):
        print('Skipping:', src, 'file not found!')
        continue
    if client.exists(dst):
        print('Skipping:', dst, 'file already exists: hadoop fs -rm', dst)
        continue
    print('Uploading', src, '=>', dst)
    client.put(src, dst)

def test_default_port_and_host(no_conf):
    guess_config()
    hdfs = HDFileSystem(connect=False)
    assert hdfs.host == conf_defaults['host']
    assert hdfs.port == conf_defaults['port']

# read from mrbox.conf
config = configparser.ConfigParser()
config.read(config_filepath)

# local folder properties
local_folder = customize_path(config['User']['localPath'], 'mrbox')
local_file_size_limit_MB = config['User']['localFileSizeMB']
remote_folder = customize_path(config['User']['hdfsPath'], 'mrbox')
if not os.path.exists(local_folder):
    os.mkdir(local_folder)
local_file_size_limit_bytes = bytes_to_mb(int(local_file_size_limit_MB))
local = MRBoxObject(local_folder, local_file_size_limit_bytes, remote_folder)

# connect to hdfs and create hadoop interface, todo: check how to create list of multiple hadoops
hdfs_con = HDFileSystem(host=config['User']['hdfsHost'], port=config['User'].getint('hdfsPort'))
hadoop_path = config['User']['hadoopPath']
hdfs_con.mkdir(remote_folder)
hadoop = HadoopInterface(hdfs_con, hadoop_path)

# create sqlite db
full_db_path = os.path.join(config['User']['localPath'], config['User']['dbFile'])
lc = LocalCatalog(full_db_path)

# todo: run sync thread for initial consistency with local folder
# create thread to monitor /mrbox directory and log events generated
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S')

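# The snippet above expects an mrbox.conf INI file with a [User] section. The sketch
# below writes a minimal example for the keys it reads; all values are hypothetical.
import configparser

example = configparser.ConfigParser()
example['User'] = {
    'localPath': '/home/user',        # local working directory (mrbox/ is created inside it)
    'localFileSizeMB': '64',          # local file size threshold in MB
    'hdfsPath': '/user/mrbox-demo',   # remote base path on HDFS
    'hdfsHost': 'localhost',          # namenode host
    'hdfsPort': '8020',               # namenode port
    'hadoopPath': '/opt/hadoop',      # local hadoop installation
    'dbFile': 'mrbox.db',             # sqlite catalog file name
}
with open('mrbox.conf', 'w') as fh:
    example.write(fh)
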
def open_file_write_direct(path, hdfs=None, **kwargs):
    if hdfs is None:
        hdfs = HDFileSystem(kwargs.get('host'), kwargs.get('port'))
    return hdfs.open(path, 'wb')

def test_default_port_and_host():
    hdfs = HDFileSystem(connect=False)
    assert hdfs.host == DEFAULT_HOST
    assert hdfs.port == DEFAULT_PORT

def read_bytes(path, client=None, hdfs=None, lazy=True, delimiter=None,
               not_zero=False, sample=True, blocksize=None, compression=None,
               **hdfs_auth):
    """ Convert location in HDFS to a list of distributed futures

    Parameters
    ----------
    path: string
        location in HDFS
    client: Client (optional)
        defaults to most recently created client
    hdfs: HDFileSystem (optional)
    lazy: boolean (optional)
        If True then return lazily evaluated dask Values
    delimiter: bytes
        An optional delimiter, like ``b'\n'`` on which to split blocks of bytes
    not_zero: force seek of start-of-file delimiter, discarding header
    **hdfs_auth: keyword arguments
        Extra keywords to send to ``hdfs3.HDFileSystem``

    Returns
    -------
    List of ``distributed.Future`` objects if ``lazy=False``
    or ``dask.Value`` objects if ``lazy=True``
    """
    if compression:
        raise NotImplementedError("hdfs compression")
    hdfs = hdfs or HDFileSystem(**hdfs_auth)
    client = default_client(client)
    blocks = get_block_locations(hdfs, path)
    filenames = [d['filename'] for d in blocks]
    offsets = [d['offset'] for d in blocks]
    if not_zero:
        offsets = [max([o, 1]) for o in offsets]
    lengths = [d['length'] for d in blocks]
    workers = [[h.decode() for h in d['hosts']] for d in blocks]

    logger.debug("Read %d blocks of binary bytes from %s", len(blocks), path)

    if sample is True:
        sample = 10000
    if sample:
        with hdfs.open(filenames[0], 'rb') as f:
            sample = f.read(sample)
    else:
        sample = b''

    f = delayed(read_block_from_hdfs, pure=True)
    values = [f(fn, offset, length, hdfs.host, hdfs.port, delimiter)
              for fn, offset, length in zip(filenames, offsets, lengths)]

    restrictions = {v.key: w for v, w in zip(values, workers)}
    client._send_to_scheduler({'op': 'update-graph',
                               'tasks': {},
                               'dependencies': [],
                               'keys': [],
                               'restrictions': restrictions,
                               'loose_restrictions': list(restrictions),
                               'client': client.id})

    return sample, values

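# Hedged usage sketch for read_bytes above: it assumes a distributed Client is already
# running, and the HDFS path, namenode host/port, and delimiter here are hypothetical.
# The call returns a byte sample from the start of the file plus one lazy value per
# HDFS block, restricted to the workers that hold that block.
sample, values = read_bytes('/data/logs/events.csv',
                            host='namenode', port=8020,
                            delimiter=b'\n')
print(sample[:80])   # first bytes sampled from the file
print(len(values))   # one lazy value per HDFS block
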
from hdfs3 import HDFileSystem

cli = HDFileSystem(host="192.168.3.140", port=8020)
cli.ls("/")

def read_bytes(fn, executor=None, hdfs=None, lazy=True, delimiter=None,
               not_zero=False, **hdfs_auth):
    """ Convert location in HDFS to a list of distributed futures

    Parameters
    ----------
    fn: string
        location in HDFS
    executor: Executor (optional)
        defaults to most recently created executor
    hdfs: HDFileSystem (optional)
    lazy: boolean (optional)
        If True then return lazily evaluated dask Values
    delimiter: bytes
        An optional delimiter, like ``b'\n'`` on which to split blocks of bytes
    not_zero: force seek of start-of-file delimiter, discarding header
    **hdfs_auth: keyword arguments
        Extra keywords to send to ``hdfs3.HDFileSystem``

    Returns
    -------
    List of ``distributed.Future`` objects if ``lazy=False``
    or ``dask.Value`` objects if ``lazy=True``
    """
    from hdfs3 import HDFileSystem
    hdfs = hdfs or HDFileSystem(**hdfs_auth)
    executor = default_executor(executor)
    blocks = get_block_locations(hdfs, fn)
    filenames = [d['filename'] for d in blocks]
    offsets = [d['offset'] for d in blocks]
    if not_zero:
        offsets = [max([o, 1]) for o in offsets]
    lengths = [d['length'] for d in blocks]
    workers = [[h.decode() for h in d['hosts']] for d in blocks]
    names = ['read-binary-hdfs3-%s-%s' % (fn, tokenize(offset, length, delimiter, not_zero))
             for fn, offset, length in zip(filenames, offsets, lengths)]

    logger.debug("Read %d blocks of binary bytes from %s", len(blocks), fn)

    if lazy:
        restrictions = dict(zip(names, workers))
        executor._send_to_scheduler({'op': 'update-graph',
                                     'tasks': {},
                                     'dependencies': [],
                                     'keys': [],
                                     'restrictions': restrictions,
                                     'loose_restrictions': names,
                                     'client': executor.id})
        values = [Value(name, [{name: (read_block_from_hdfs, fn, offset, length,
                                       hdfs.host, hdfs.port, delimiter)}])
                  for name, fn, offset, length in zip(names, filenames, offsets, lengths)]
        return values
    else:
        return executor.map(read_block_from_hdfs, filenames, offsets, lengths,
                            host=hdfs.host, port=hdfs.port, delimiter=delimiter,
                            workers=workers, allow_other_workers=True)

# In[ ]:

label_vocabulary = ['ad.', 'nonad.']
target = '1558'
cols_categorical = ['0', '1', '2', '3']
default_value = [[0]] + [[""]] * 4 + [[0.]] * 1554 + [['nonad.']]  # cols ['0', '1', '2', '3'] treated as categorical


# ### get feature/label column names

# In[ ]:

from hdfs3 import HDFileSystem

hdfs = HDFileSystem(host='172.17.0.2', port=9000)  # namenode and port
with hdfs.open(training_data_pandas) as f:
    df = pd.read_csv(f, delimiter=delim, index_col=0,
                     skipinitialspace=True, nrows=0)

feature_cols = {}
idx = 1
for col in df.columns:
    feature_cols[col] = idx
    idx += 1
label_cols = {target: feature_cols.pop(target)}

def test_sync_segment_freshness(self):
    sync.init(self.rethinker)
    with tempfile.TemporaryDirectory() as tmp_dir:
        self.rethinker.table('lock').delete().run()
        self.rethinker.table('assignment').delete().run()
        self.rethinker.table('services').delete().run()
        controller = self.make_fresh_controller()
        controller.local_data = tmp_dir
        assert controller.healthy_service_ids == set()

        # make segment 4 a segment of interest
        with open(os.path.join(tmp_dir, '4.sqlite'), 'wb'):
            pass
        controller.sync()
        assert controller.healthy_service_ids == {'trough-read:test01:4'}

        # create a write lock
        lock = sync.Lock.acquire(self.rethinker, 'trough-write:test01:4', {'segment': '4'})
        controller.sync()
        assert controller.healthy_service_ids == {
            'trough-read:test01:4', 'trough-write:test01:4'}
        locks = list(self.rethinker.table('lock').run())
        assert len(locks) == 1
        assert locks[0]['id'] == 'trough-write:test01:4'

        self.rethinker.table('lock').delete().run()
        self.rethinker.table('assignment').delete().run()

    # clean slate
    with tempfile.TemporaryDirectory() as tmp_dir:
        hdfs = HDFileSystem(host=controller.hdfs_host, port=controller.hdfs_port)
        hdfs.rm(controller.hdfs_path, recursive=True)
        hdfs.mkdir(controller.hdfs_path)
        with hdfs.open(os.path.join(controller.hdfs_path, '5.sqlite'), 'wb', replication=1) as f:
            f.write(b'y' * 1024)
        self.rethinker.table('lock').delete().run()
        self.rethinker.table('assignment').delete().run()
        self.rethinker.table('services').delete().run()
        controller = self.make_fresh_controller()
        controller.local_data = tmp_dir

        # create an assignment without a local segment
        assignment = sync.Assignment(self.rethinker, d={
            'hash_ring': 'a',
            'node': 'test01',
            'segment': '5',
            'assigned_on': r.now(),
            'bytes': 0,
            'remote_path': os.path.join(controller.hdfs_path, '5.sqlite')})
        assignment.save()
        lock = sync.Lock.acquire(self.rethinker, 'write:lock:5', {'segment': '5'})
        assert len(list(self.rethinker.table('lock').run())) == 1
        controller.healthy_service_ids.add('trough-write:test01:5')
        controller.healthy_service_ids.add('trough-read:test01:5')
        controller.sync()
        assert controller.healthy_service_ids == {'trough-read:test01:5'}
        assert list(self.rethinker.table('lock').run()) == []

        # clean up
        hdfs.rm(controller.hdfs_path, recursive=True)
        hdfs.mkdir(controller.hdfs_path)

    # third case: not assigned, local file exists, is older than hdfs
    # this corresponds to the situation where we have an out-of-date
    # segment on disk that was probably a write segment before it was
    # reassigned when it was pushed upstream
    with tempfile.TemporaryDirectory() as tmp_dir:
        # create a local segment without an assignment
        with open(os.path.join(tmp_dir, '6.sqlite'), 'wb'):
            pass
        time.sleep(2)

        # create file in hdfs with newer timestamp
        hdfs = HDFileSystem(host=controller.hdfs_host, port=controller.hdfs_port)
        hdfs.rm(controller.hdfs_path, recursive=True)
        hdfs.mkdir(controller.hdfs_path)
        with hdfs.open(os.path.join(controller.hdfs_path, '6.sqlite'), 'wb', replication=1) as f:
            f.write(b'z' * 1024)
        self.rethinker.table('lock').delete().run()
        self.rethinker.table('assignment').delete().run()
        self.rethinker.table('services').delete().run()
        controller = self.make_fresh_controller()
        controller.local_data = tmp_dir
        controller.healthy_service_ids.add('trough-write:test01:6')
        controller.healthy_service_ids.add('trough-read:test01:6')
        controller.sync()
        assert controller.healthy_service_ids == set()

        # clean up
        hdfs.rm(controller.hdfs_path, recursive=True)
        hdfs.mkdir(controller.hdfs_path)

    f.write(data)
    with hdfs_client.open(file_a, 'rb') as f:
        out = f.read(len(data))
    assert out == data


def hdfs_readlines(hdfs_client):
    file_b = '/tmp/test/file_b'
    with hdfs_client.open(file_b, 'wb', replication=1) as f:
        f.write(b"hello\nhadoop")
    with hdfs_client.open(file_b, 'rb') as f:
        lines = f.readlines()
    assert len(lines) == 2


if __name__ == '__main__':
    hdfs_client = HDFileSystem(host=test_host, port=test_port)
    hdfs_exists(hdfs_client)
    hdfs_write_read(hdfs_client)
    hdfs_readlines(hdfs_client)
    hdfs_client.disconnect()
    print("-" * 20)
    print("hello hadoop")

import pandas as pd
import numpy as np
import gc
import gzip
from hdfs3 import HDFileSystem

hdfs = HDFileSystem(host='horton')


def entropy1(x):  # use this one!
    # x is a pd.Series that has already been binned with pd.cut
    counts = x.value_counts(normalize=True, dropna=False).values  # ndarray
    return -counts.dot(np.log(counts + np.e**-100))


# Read the data; every variable in this dataset is categorical
path = "/user/runyu/data/masked/TSD/join_result_37_smp2_one_final_bins.csv"  # hdfs
with hdfs.open(path) as f:
    df = pd.read_csv(f, low_memory=False, na_values=['.', ' ', ''],
                     keep_default_na=False, skipinitialspace=True, memory_map=True)

target = df['mm18_bad']  # already fixed
rtn = pd.DataFrame()
rtn['name'] = df.columns

print("start cal own entropy ...")
# compute each column's own entropy
for col_name in df.columns:

def test_different_handles():
    a = HDFileSystem(host='localhost', port=8020)
    b = HDFileSystem(host='localhost', port=8020)
    assert a._handle.contents.filesystem != b._handle.contents.filesystem

    # lock.acquire()
    return elp.save_as_comments(text, HDFS)
    # lock.release()


# handle the session
els = Elong_session()


def get_info(hid, n):
    return els.get_info(hid, n)


# fetch the data
elc = Elong_spyder()


def get_data(json, hid):
    return elc.get_data(json, hid)


# handle queued data
def deal_queue(text):
    queue_data.put(text)
    queue_data.task_done()


global HDFS
HDFS = HDFileSystem(host='192.168.100.178', port=8020)
# ******************************************************************************************************************

from hdfs3 import HDFileSystem

hdfs = HDFileSystem(host='trevally.amer.nevint.com', port=9000)

""" ''' Т.к. `libhdfs3` - это библиотека для C/C++, то будем использовать обертку для нее под Python - `hdfs3` https://hdfs3.readthedocs.io/en/latest/ Модуль поддерживает и Python2 и Python3, поэтому будем работать с дефолтной версией интерпретатора Проверяем установлена ли библиотека `hdfs3` pydoc modules Библиотеки не оказлось - устанавливаем pip install hdfs3 Выясняем адрес и порт для запросов hdfs getconf -confKey fs.defaultFS Дальше запускаем терминал Python и работаем в нем (как вариант - можно подготовить скрипт и запускать его) ''' from hdfs3 import HDFileSystem hdfs = HDFileSystem(host='manager.novalocal', port=8020) ''' И здесь он нам говорит: "Can not find the shared library: libhdfs3.so" и предлагает установить ее по инструкции http://hdfs3.readthedocs.io/en/latest/install.html, ок - пробуем установить yum install libhdfs3 libhdfs3-dev а он нам: "You need to be root to perform this command." финиталякомедия '''
def handle(q):
    hdfs = HDFileSystem(host=test_host, port=test_port)
    q.put(hdfs._handle.contents.filesystem)

# READ THE OBJECTS ONE PAGE AT A TIME
for evol in range(1, count, PAGE_SIZE):
    # request one page of PAGE_SIZE results starting at this offset
    this_evol_link = link + "/?limit=" + str(PAGE_SIZE) + "&offset=" + str(evol)
    evol_output.write(requests.get(this_evol_link).text)
contents = evol_output.getvalue()

# WRITE RESULTS TO OUTPUT FILE
local_path = "/home/jeff/output/pokemon-site.output"
output_file = open(local_path, "w+")
output_file.write(contents)
output_file.close()

# CLOSE DATA BUFFER
evol_output.close()

# WRITE RESULTS TO HDFS FILE
# SETUP
from hdfs3 import HDFileSystem
hdfs_host = "localhost"
hdfs_port = 50502
hdfs_path = "/user/pokemon-site.output"

# FILE MOVE
hdfs_sys = HDFileSystem(host=hdfs_host, port=hdfs_port)
# hdfs_sys.put(local_path, hdfs_path)
with hdfs_sys.open(hdfs_path, 'wb') as output_file:
    output_file.write(contents.encode('utf-8'))