def read_result_from_hdfs(username):
    result = ""
    to_return = {}
    file_path = "/jobs_done/" + username + "/part-00000"
    logger.debug("Reading file " + file_path + " from HDFS")
    try:
        logger.debug("Trying to connect to " + hdfs_namenodes[0] + " namenode")
        hdfs_client = PyWebHdfsClient(host=hdfs_namenodes[0], port='50070',
                                      user_name='xnet', timeout=100)
        result = hdfs_client.read_file(file_path)
    except (ActiveHostNotFound, ConnectionError) as e:
        to_return["details_1"] = str(e)
        try:
            logger.debug("Trying to connect to " + hdfs_namenodes[1] + " namenode")
            hdfs_client = PyWebHdfsClient(host=hdfs_namenodes[1], port='50070',
                                          user_name='xnet', timeout=100)
            result = hdfs_client.read_file(file_path)
        except (ActiveHostNotFound, ConnectionError) as e2:
            to_return["error"] = "There was a problem while trying to read result from HDFS."
            to_return["details_2"] = str(e2)
            logger.debug(str(to_return))
            return False, to_return
    return True, result
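# A minimal usage sketch for the failover reader above (hedged: the username
# and print handling are hypothetical; assumes the module-level
# `hdfs_namenodes` list and `logger` used by the function are configured):
ok, payload = read_result_from_hdfs("some_user")
if ok:
    print(payload)  # raw contents of part-00000
else:
    print(payload["error"], payload.get("details_1"), payload.get("details_2"))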
def main(argv):
    """
    Main method. This method performs the following tasks:
    1. Parse command line arguments
    2. Retrieve credentials and connect to Cloudant and WebHDFS
    3. Connect to the Cloudant `_changes` feed for checkpointed document consumption
    4. Process each change individually.
    5. Upon an exception, store the latest checkpoint to a local file and exit.
    """

    # add options into the parser
    parser = configureOptions()
    (options, args) = parser.parse_args()
    checkRequiredArguments(options, parser)
    print(options)

    # configuration
    last_seq = options.last_seq

    # get credentials
    perm_file = '%s/.clou' % os.environ['HOME']
    creds = get_creds(perm_file)

    # connect to the source database
    s = Server('https://%s:%s@%s' % (creds['cloudant_user'],
                                     creds['cloudant_pwd'], options.uri))
    db = s[options.dbname]

    # connect to the target HDFS cluster
    hdfs = PyWebHdfsClient(host=options.hdfs_host, port=options.hdfs_port,
                           user_name=creds['hdfs_user'])
    hdfs.make_dir(options.hdfs_path)

    # and here we consume the cloudant `_changes` feed
    counter = 0
    changestream = ChangesStream(db, include_docs=True, heartbeat=True,
                                 since=last_seq)
    for c in changestream:
        try:
            if counter % 100 == 0:
                checkpoint(last_seq)
            seq = processChange(hdfs, c, options.hdfs_path)
            if seq:  # protect against the last line being blank
                last_seq = seq
            counter += 1
        except Exception:
            traceback.print_exc()
            checkpoint(last_seq)
            os._exit(1)

    checkpoint(last_seq)
def ship_udf(ic, function, hdfs_path=None, udf_name=None, database=None,
             overwrite=False):
    # extract some information from the function
    if udf_name is None:
        udf_name = function.name
    symbol = function.llvm_func.name
    ir = function.llvm_module.to_bitcode()
    return_type = udf_to_impala_type[function.signature.return_type.name]
    arg_types = [udf_to_impala_type[arg.name]
                 for arg in function.signature.args[1:]]

    # ship the IR to the cluster
    hdfs_client = PyWebHdfsClient(host=ic._nn_host, port=ic._webhdfs_port,
                                  user_name=ic._hdfs_user)
    if hdfs_path is None:
        hdfs_path = os.path.join(ic._temp_dir, udf_name + '.ll')
    if not hdfs_path.endswith('.ll'):
        raise ValueError("The HDFS file name must end with .ll")
    hdfs_client.create_file(hdfs_path.lstrip('/'), ir, overwrite=overwrite)

    # register the function in Impala
    if database is None:
        database = ic._temp_db
    impala_name = '%s.%s(%s)' % (database, udf_name, ', '.join(arg_types))
    if overwrite:
        ic._cursor.execute("DROP FUNCTION IF EXISTS %s" % impala_name)
    register_query = ("CREATE FUNCTION %s RETURNS %s "
                      "LOCATION '%s' SYMBOL='%s'") % (impala_name, return_type,
                                                      hdfs_path, symbol)
    ic._cursor.execute(register_query)
def upload_file():
    """
    Upload file
    ---
    tags:
      - Files
    consumes: "multipart/form-data"
    parameters:
      - name: file
        in: formData
        required: true
        paramType: body
        dataType: file
        type: file
    responses:
      200:
        description: Return a successful message
      401:
        description: Unauthorized
      400:
        description: Bad Request
      500:
        description: Internal Server Error
    """
    # config information is hard-coded here; you should improve this
    hdfs = PyWebHdfsClient(host='webhdfs', port='50070',
                           user_name='thanhson1085')
    if request.method == 'POST':
        file = request.files['file']
        if file and allowed_file(file.filename):
            filename = secure_filename(str(time.time()) + file.filename)
            my_file = 'tmp/thanhson1085/data/' + filename
            hdfs.create_file(my_file, file)
            return jsonify({'success': 'true'})
    return jsonify({'success': 'false'})
def solarLog_call(epoch_time):
    r = requests.get("http://winsun.solarlog-web.ch/api?cid=" + pfadheimBaarCID
                     + "&locale=de_ch&username=277555406"
                     + "&password=5a03cdf0a3ff42de09bc85361d8a2f0f"
                     + "&function=dashboard&format=jsonh&solarlog=9112"
                     + "&tiles=Yield|true,Grafic|true,Env|true,Weather|true"
                     + "&ctime=" + epoch_time)
    logging.info("Response: " + str(r.status_code) + " " + r.reason)

    data = r.json()  # the entire response content
    data['timestamp'] = epoch_time

    # remove keys with complex JSON structure
    del data['cur_production_per_wrid']
    del data['invEnergyType']
    logging.debug(data)

    # write data to .json
    json_path = ('/home/claude/repo/bda-solar/data/data_timestamp/'
                 'pfadibaar_solarlog_' + epoch_time + '.json')
    with open(json_path, 'w', encoding='utf-8') as outfile:
        json.dump(data, outfile, indent=4, ensure_ascii=False)

    # write the same data as .csv since it is easier to handle with HDFS
    csv_path = ('/home/claude/repo/bda-solar/data/data_timestamp/'
                'pfadibaar_solarlog_' + epoch_time + '.csv')
    with open(csv_path, 'w') as f:  # just use 'w' mode in 3.x
        w = csv.DictWriter(f, data.keys(), dialect=csv.excel_tab)
        w.writeheader()
        w.writerow(data)

    # upload the .csv to HDFS (the original wrote the placeholder string
    # '0100'; uploading the file just written matches the commented-out intent)
    hdfs = PyWebHdfsClient(host='193.246.208.147', port='50079',
                           user_name='hdfs')
    with open(csv_path, 'rb') as file_data:
        hdfs.create_file('user/hdfs/pfadibaar_solarlog.csv', file_data,
                         overwrite=True)
def save_extracted_subgraph(elements, args: application_args):
    pair, subgraph, _ = elements
    path = args.get_folder_results_path()
    hdfs = PyWebHdfsClient(host=args.hdfs_host, port=args.hdfs_port)
    file = os.path.join(path, f"graph_{pair[0]}_{pair[1]}")
    pickled = pkl.dumps(subgraph)
    hdfs.create_file(file, pickled, overwrite=True)
def saveToStore(path, meta):
    con = happybase.Connection(MasterHbase)
    con.open()
    metaTable = con.table('MetaTable')
    if meta['size'] < largeSize:
        # save to HBase
        encTable = con.table('EncTable')
        with open(path, 'rb') as f:
            encTable.put(meta['rowkey'], {'enc:data': f.read()})
        metaTable.put(str(meta['rowkey']), {
            'pp:name': str(meta['filename']),
            'pp:checksum': str(meta['checksum']),
            'pp:size': str(meta['size']),
            'pp:often': str(meta['often']),
            'pp:des': str(meta['description'])
        })
        app.logger.debug('%s is saved to Hbase', meta['rowkey'])
    else:
        # save to HDFS
        hdfs = PyWebHdfsClient(host=Master, port='50070', timeout=None,
                               user_name='hduser')
        with open(path, 'rb') as f:
            hdfs.create_file(HDFSMainPath + meta['rowkey'], f)
        metaTable.put(str(meta['rowkey']), {
            'pp:name': str(meta['filename']),
            'pp:checksum': str(meta['checksum']),
            'pp:size': str(meta['size']),
            'pp:HDFSpath': str(HDFSMainPath + meta['rowkey']),
            'pp:often': str(meta['often']),
            'pp:des': str(meta['description'])
        })
        app.logger.debug('%s is saved to HDFS', meta['rowkey'])
    con.close()
def to_hdfs(file_path, hdfs_path):
    hdfs = PyWebHdfsClient(host='hdfs-v1', port='50070', user_name='hdfs',
                           timeout=100)
    with open(file_path, 'rb') as f:
        hdfs.create_file(hdfs_path, f, overwrite=True)
def downParts(fpath):
    """Download Spark output (the series of part-0??? files) from HDFS,
    concatenating the parts into a single local file as they are fetched."""
    from pywebhdfs.webhdfs import PyWebHdfsClient
    hdfs = PyWebHdfsClient(host='iasp76', port='12003', user_name='mci')
    flist = hdfs.list_dir(fpath)
    x = flist['FileStatuses']['FileStatus']
    _SUCCESS = False
    for f in x:
        if f['pathSuffix'] == '_SUCCESS':
            _SUCCESS = True
            break
    if not _SUCCESS:
        print("not complete yet!")
        return
    fnames = [f['pathSuffix'] for f in x if f['pathSuffix'].startswith('part-')]
    fnames1 = sorted(fnames)
    foutname = fpath[fpath.rfind('/') + 1:]
    l = len(fnames1)
    with open(foutname, "wb") as fo:
        for fname in fnames1:
            fpath1 = fpath + "/" + fname
            fo.write(hdfs.read_file(fpath1))
            print(" progress: ", fname, l)
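# Hypothetical call for the downloader above: once the Spark job has written
# its _SUCCESS marker, this stitches all part-* files into ./result locally.
downParts('/user/mci/output/result')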
def setUp(self):
    self.host = 'hostname'
    self.port = '00000'
    self.user_name = 'username'
    self.path = 'user/hdfs'
    self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                   user_name=self.user_name)
def setup_common_oozie_libs(name_node):
    webhdfs_port = '14000'
    webhdfs_user = '******'
    platform_dir = 'user/deployment/platform'
    lib_path_list = ['/opt/cloudera/parcels/CDH/lib/hbase/hbase-client.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/hbase-common.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/hbase-protocol.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/hbase-server.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/lib/htrace-core.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/hbase-hadoop-compat.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/hbase-it.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/hbase-prefix-tree.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/lib/zookeeper.jar',
                     '/opt/cloudera/parcels/CDH/lib/pig/piggybank.jar']

    # Set up a connection to HDFS using the namenode.
    hdfs_client = PyWebHdfsClient(host=name_node, port=webhdfs_port,
                                  user_name=webhdfs_user, timeout=None)
    # Create the target directory on the Hadoop file system (HDFS).
    hdfs_client.make_dir(platform_dir)

    # Create each file on HDFS, writing its contents from the local FS.
    for path in lib_path_list:
        platform_file = '%s/%s' % (platform_dir, os.path.basename(path))
        logging.info('Copying source file: %s to HDFS path %s', path,
                     platform_file)
        with open(path) as file_data:
            hdfs_client.create_file(platform_file, file_data, overwrite=True)
def setUp(self):
    self.host = 'hostname'
    self.port = '00000'
    self.user_name = 'username'
    self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                   user_name=self.user_name)
    self.response = MagicMock()
    self.requests = MagicMock(return_value=self.response)
    self.path = 'user/hdfs/old_dir'
    self.file_status = {
        "FileStatus": {
            "accessTime": 0,
            "blockSize": 0,
            "group": "supergroup",
            "length": 0,
            "modificationTime": 1320173277227,
            "owner": "webuser",
            "pathSuffix": "",
            "permission": "777",
            "replication": 0,
            "type": "DIRECTORY"
        }
    }
    self.response.json = MagicMock(return_value=self.file_status)
def __init__(self, host, port, user, logger):
    self._hdfs = PyWebHdfsClient(host=host, port=port, user_name=user,
                                 timeout=None)
    global LOGGER
    LOGGER = logger
    LOGGER.debug('webhdfs = %s@%s:%s', user, host, port)
def load(self, job, task, fifo):
    self.job = job
    self.task = task
    self.fifo = fifo
    self.key = None
    self.script_proc = None
    self.decompress_obj = None
    self.pycurl_callback_exception = None

    if task.data['scheme'] == 's3':
        self.is_anonymous = (job.spec.source.aws_access_key is None
                             or job.spec.source.aws_secret_key is None)
        if self.is_anonymous:
            s3_conn = S3Connection(anon=True)
        else:
            s3_conn = S3Connection(job.spec.source.aws_access_key,
                                   job.spec.source.aws_secret_key)
        bucket = s3_conn.get_bucket(task.data['bucket'])
        try:
            self.key = bucket.get_key(task.data['key_name'])
        except S3ResponseError as e:
            raise WorkerException("Received %s %s accessing `%s`, aborting"
                                  % (e.status, e.reason, task.data['key_name']))
    elif task.data['scheme'] == 'hdfs':
        fname = task.data['key_name']
        client = PyWebHdfsClient(job.spec.source.hdfs_host,
                                 job.spec.source.webhdfs_port,
                                 user_name=job.spec.source.hdfs_user)
        try:
            filesize = client.get_file_dir_status(fname)['FileStatus']['length']
        except pywebhdfs.errors.FileNotFound:
            raise WorkerException("File '%s' does not exist on HDFS" % fname)
        self.key = AttrDict({'name': fname, 'size': filesize})
    elif task.data['scheme'] == 'file':
        globber = glob2.Globber()
        fname = globber._normalize_string(task.data['key_name'])
        if not os.path.exists(fname):
            raise WorkerException("File '%s' does not exist on this filesystem"
                                  % fname)
        elif not os.path.isfile(fname):
            raise WorkerException("File '%s' exists, but is not a file" % fname)
        self.key = AttrDict({'name': fname, 'size': os.path.getsize(fname)})
    else:
        raise WorkerException('Unsupported job with paths: %s'
                              % [str(p) for p in self.job.paths])

    if self.key is None:
        raise WorkerException('Failed to find key associated with task ID %s'
                              % task.task_id)

    self.metrics = DownloadMetrics(self.key.size)
def test_create_throws_exception_for_not_created(self, mock_put):
    webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                              user_name=self.user_name)
    self.init_response.status_code = http_client.TEMPORARY_REDIRECT
    self.response.status_code = http_client.BAD_REQUEST
    mock_put.side_effect = [self.init_response, self.response]
    with self.assertRaises(errors.PyWebHdfsException):
        webhdfs.create_file(self.path, self.file_data)
def webhdfs_connect():
    webhdfs = PyWebHdfsClient(base_uri_pattern=cfg['DEFAULT']['HDFS_BASEURL'],
                              request_extra_opts={
                                  'verify': cfg['DEFAULT'].get('HDFS_CERT', None),
                                  'auth': get_auth()
                              })
    return webhdfs
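# For comparison (a sketch with a hypothetical hostname): the
# base_uri_pattern constructor used above is an alternative to the host/port
# form seen in the other examples, and is what allows an https:// endpoint
# plus extra requests options such as `verify` and `auth`.
webhdfs_a = PyWebHdfsClient(
    base_uri_pattern='http://namenode.example.com:50070/webhdfs/v1/')
webhdfs_b = PyWebHdfsClient(host='namenode.example.com', port='50070')
# webhdfs_a and webhdfs_b address the same WebHDFS endpoint.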
def get_file_contents(self, hdfs_path, user_name='trifacta',
                      httpfs_port='14000'):
    hdfs = PyWebHdfsClient(host=urlparse(self.trifacta_base_url).netloc,
                           port=httpfs_port, user_name=user_name)
    return hdfs.read_file(hdfs_path).decode('utf-8')
def getHdfsClient(self):
    try:
        hdfs = PyWebHdfsClient(host=self.server, port=self.port,
                               user_name=self.user)
    except Exception:
        etype, evalue, etb = sys.exc_info()
        self.logger.error('Could not connect to webhdfs service on %s. '
                          'Exception: %s, Error: %s.'
                          % (self.getConfigurationValue('server'), etype, evalue))
        return None
    return hdfs
def from_hdfs(hdfs_path, file_path):
    hdfs = PyWebHdfsClient(host='hdfs-v1', port='50070', user_name='hdfs',
                           timeout=100)
    binary_file = hdfs.read_file(hdfs_path)
    with open(file_path, 'wb') as f:
        f.write(binary_file)
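# Round-trip sketch combining to_hdfs/from_hdfs above (paths are
# hypothetical; assumes the 'hdfs-v1' namenode those helpers hard-code is
# reachable):
to_hdfs('/tmp/local_in.bin', 'user/hdfs/example.bin')
from_hdfs('user/hdfs/example.bin', '/tmp/local_out.bin')
# /tmp/local_out.bin now contains the same bytes as /tmp/local_in.bin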
def test_webhdfs_csv(self):
    from pywebhdfs.webhdfs import PyWebHdfsClient
    dfs = PyWebHdfsClient(host='localhost', port='9870', user_name='hadoop')
    dfs.make_dir("/temp")
    with open("tests/data/data.csv") as input_file:
        dfs.create_file("/temp/data.csv", file_data=input_file, overwrite=True)
    dfs.delete_file_dir("/temp", recursive=True)
def test_init_args_provided(self):
    host = '127.0.0.1'
    port = '50075'
    user_name = 'myUser'
    webhdfs = PyWebHdfsClient(host=host, port=port, user_name=user_name)
    self.assertEqual(host, webhdfs.host)
    self.assertEqual(port, webhdfs.port)
    self.assertEqual(user_name, webhdfs.user_name)
def setUp(self):
    self.host = 'hostname'
    self.port = '00000'
    self.user_name = 'username'
    self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                   user_name=self.user_name)
    self.response = MagicMock()
    self.requests = MagicMock(return_value=self.response)
    self.path = 'user/hdfs/old_dir'
def setUp(self):
    self.host = 'hostname'
    self.port = '00000'
    self.user_name = 'username'
    self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                   user_name=self.user_name)
    self.path = 'user/hdfs/old_dir'
    self.xattr = 'user.test'
    self.response = MagicMock()
def hdfs_client(ic):
    pywebhdfs = importorskip('pywebhdfs')
    if ic._nn_host is None:
        skip("NAMENODE_HOST not set; skipping...")
    from pywebhdfs.webhdfs import PyWebHdfsClient
    hdfs_client = PyWebHdfsClient(host=ic._nn_host, port=ic._webhdfs_port,
                                  user_name=ic._hdfs_user)
    return hdfs_client
def setUp(self):
    self.host = 'hostname'
    self.port = '00000'
    self.user_name = 'username'
    self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                   user_name=self.user_name)
    self.path = 'user/hdfs'
    self.file_data = u'010101'
    self.response = MagicMock()
    self.response.content = self.file_data
def test_create_returns_file_location(self, mock_put):
    webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                              user_name=self.user_name)
    self.init_response.status_code = http_client.TEMPORARY_REDIRECT
    self.response.status_code = http_client.CREATED
    mock_put.side_effect = [self.init_response, self.response]
    result = webhdfs.create_file(self.path, self.file_data)
    self.assertTrue(result)
    mock_put.assert_called_with(self.location, headers=self.expected_headers,
                                data=self.file_data)
def put_file_contents(self, hdfs_path, file_contents, user_name='trifacta',
                      httpfs_port='14000'):
    hdfs = PyWebHdfsClient(host=urlparse(self.trifacta_base_url).netloc,
                           port=httpfs_port, user_name=user_name)
    hdfs.create_file(hdfs_path, file_contents, overwrite=True)
    return True
def setUp(self):
    self.host = 'hostname'
    self.port = '00000'
    self.user_name = 'username'
    self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                   user_name=self.user_name)
    self.response = MagicMock()
    self.path = 'user/hdfs/old_dir'
    self.new_path = '/user/hdfs/new_dir'
    self.rename = {"boolean": True}
    self.response.json = MagicMock(return_value=self.rename)
def classify(self, inputJson):
    self.hdfs = PyWebHdfsClient(host=self.config.acm.servers.hdfs.host,
                                port=self.config.acm.servers.hdfs.restPort,
                                user_name=self.config.acm.servers.hdfs.fileOwner)
    self.hdfsServerUrl = ("hdfs://" + self.config.acm.servers.hdfs.host + ":"
                          + str(self.config.acm.servers.hdfs.port))
    if not hasattr(self, 'sc'):
        self.sc = SparkContext()
    if not hasattr(self, 'sqlContext'):
        self.sqlContext = SQLContext(self.sc)

    schema = StructType([StructField('Category', StringType(), True),
                         StructField('Descript', StringType(), True),
                         StructField('Dates', StringType(), True),
                         StructField('DayOfWeek', StringType(), True),
                         StructField('PdDistrict', StringType(), True),
                         StructField('Resolution', StringType(), True),
                         StructField('Address', StringType(), True),
                         StructField('X', DoubleType(), True),
                         StructField('Y', DoubleType(), True)])
    test = self.sqlContext.createDataFrame(inputJson, schema)

    # transform the input with the fitted pipeline loaded from HDFS
    pipeline = PipelineModel.load(self.pipelineHdfsPath)
    testData = pipeline.transform(test)
    print("Test Dataset Count: " + str(testData.count()))

    # load the trained model and score the test data
    lrModel = LogisticRegressionModel.load(self.modelHdfsPath)
    predictions = lrModel.transform(testData)
    predictions.filter(predictions['prediction'] == 7) \
        .select("Descript", "Category", "probability", "label", "prediction") \
        .orderBy("probability", ascending=False) \
        .show(n=10, truncate=30)

    resultJson = predictions.filter(predictions['prediction'] == 7) \
        .select("prediction") \
        .orderBy("probability", ascending=False) \
        .toJSON().collect()

    self.sc.stop()
    return ["al sana ML!", resultJson]  # Turkish: "there's your ML!"
def hdfs_connection(self, host, port, user_name, hdfs_path='/tmp/'):
    '''
    Establish a connection to HDFS in preparation for creating, retrieving,
    updating and deleting files there. We use pywebhdfs to do this via the
    HDFS REST API. (See more: http://pythonhosted.org/pywebhdfs/)
    :param : host - hdfs rest host
    :param : port - hdfs rest running port
    :param : user_name - hdfs username (for authentication)
    :param : hdfs_path - location to store files. (default: '/tmp/')
    :return: Nothing.
    '''
    self.hdfs = PyWebHdfsClient(host=host, port=port, user_name=user_name)
    self.hdfs_path = hdfs_path
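# Hypothetical usage of the connection helper above (`client` stands for
# whatever object defines it; host, port and user are placeholders):
client.hdfs_connection(host='namenode.example.com', port='50070',
                       user_name='hdfs', hdfs_path='tmp/')
client.hdfs.create_file(client.hdfs_path + 'hello.txt', b'hello',
                        overwrite=True)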