from hdfs import InsecureClient


def handleHdfsDownload(hdfs_path, local_path):
    client = InsecureClient("http://hdfs.neurolearn.com:50070", user="******")
    client.download(hdfs_path, local_path, overwrite=True)
    print('Downloaded Images from HDFS.')
    return local_path
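# A minimal usage sketch for handleHdfsDownload; the HDFS source directory and the
# local target directory below are placeholders, not paths from the original code.
if __name__ == '__main__':
    images_dir = handleHdfsDownload('/datasets/images', '/tmp/images')
    print('Images available at', images_dir)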
import os

import pandas as pd
from hdfs import InsecureClient

os.environ['WEBHDFS_URI'] = 'http://172.17.0.2:50070'  # DO NOT DO THIS, FOR LOCAL TESTING ONLY

source_csv = '/user/root/test.csv'
content = '"eva","date","duration","country","purpose","crew"\n'
content += '"1","1965-06-03T00:00:00","0:36","USA","First U.S. EVA. Used HHMU and took photos.","Ed White"\n'
content += '"9","1966-11-13T00:00:00","2:06","USA","Attached tether between Agena and Gemini.","Buzz Aldrin"\n'

# Step 0: write the file for the test
# - Client instance pointing at a specific HDFS
source_hdfs_url = os.environ['WEBHDFS_URI']
source_hdfs = InsecureClient(url=source_hdfs_url, user='******')
with source_hdfs.write(source_csv, append=False, overwrite=True, encoding='utf-8') as writer:
    writer.write(content)

################################################################################
# Step 1: read the file
# - Reuse the HDFS client
with source_hdfs.read(source_csv, encoding='utf-8') as reader:
    data = pd.read_csv(reader, index_col=0)
    data = data.loc[:, ['date', 'country']]
from hdfs import InsecureClient
from pyspark.sql import SparkSession
from json import dump
import json

client_hdfs = InsecureClient('http://localhost:9870', user='******')

# spark = SparkSession \
#     .builder \
#     .master("local") \
#     .appName("Protob Conversion to Parquet") \
#     .config("spark.some.config.option", "some-value") \
#     .getOrCreate()

spark = SparkSession \
    .builder \
    .appName("myApp") \
    .config("spark.mongodb.input.uri", "mongodb://localhost:27017/numtest.kafka") \
    .config("spark.mongodb.output.uri", "mongodb://localhost:27017/numtest.kafka") \
    .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.11:2.3.2') \
    .getOrCreate()

mongo = spark.read.format("com.mongodb.spark.sql.DefaultSource").option(
    "spark.mongodb.input.uri", 'mongodb://localhost:27017/numtest.kafka').load()
results = mongo.toJSON().map(lambda j: json.loads(j)).collect()
mongo.show()

with client_hdfs.write('/home/hadoop/hdfs/helloworld2.json', encoding='utf-8') as writer:
    dump(results, writer)
from __future__ import print_function  # In python 2.7

from flask import Flask, request
import subprocess
import logging
import os
from hdfs import InsecureClient
import shutil

app = Flask(__name__)
app.config['DEBUG'] = True
app.url_map.strict_slashes = False

hdfsLocation = "hdfs:50070"
client_hdfs = InsecureClient('http://' + hdfsLocation)


def debug(string):
    app.logger.debug(string)


@app.route('/uploader', methods=['POST'])
def upload():
    # /sparkgis/sample_data/
    loc = request.form.get('name')
    f = request.files['file']
    f.save("tmp/file")
    client_hdfs.upload(str(loc), "../web/tmp/file")
import os
from csv import reader

import pandas as pd
from hdfs import InsecureClient

client_hdfs = InsecureClient('http://localhost:50070')

with client_hdfs.read('/user/hdfs/wiki/helloworld.csv', encoding='utf-8') as reader:
    df = pd.read_csv(reader, index_col=0)

# with client_hdfs.write('/user/temp/repourldata.csv', encoding='utf-8') as writer:
#     df.to_csv(writer)

print('SUCCESSFUL RUN')
import io

# For Data Lake
from hdfs import InsecureClient
# For Data Warehouse
from pyhive import hive

import pandas as pd

df_source = pd.read_csv(r'output/price.csv')

# Define HDFS interface
hdfs_interface = InsecureClient('http://localhost:50070')
hdfs_interface.list('/')

# Delete old data
hdfs_interface.delete('/wqd7005/raw_price', recursive=True, skip_trash=True)

# Create hdfs directories to store data
hdfs_interface.makedirs('/wqd7005')
hdfs_interface.makedirs('/wqd7005/raw_price')
hdfs_interface.list('/wqd7005')

# Write data to raw_price directory
# text buffer
s_buf = io.StringIO()
# saving a data frame to a buffer (same as with a regular file):
df_source.to_csv(s_buf, index=False, header=False)
hdfs_interface.write('/wqd7005/raw_price/000000_0', data=s_buf.getvalue())
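# The pyhive import above is labeled "For Data Warehouse" but not used in this snippet.
# A hedged sketch of how the raw_price directory might be exposed on the warehouse side,
# assuming a HiveServer2 on localhost:10000; the table name and one-column schema are
# placeholders, not part of the original code.
hive_conn = hive.Connection(host='localhost', port=10000)
cursor = hive_conn.cursor()
cursor.execute(
    "CREATE EXTERNAL TABLE IF NOT EXISTS raw_price (line STRING) "
    "LOCATION '/wqd7005/raw_price'"
)
cursor.execute("SELECT COUNT(*) FROM raw_price")
print(cursor.fetchone())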
def __init__(self, *opts):
    self.client = InsecureClient(current_app.config['WEBHDFS_ADDR'],
                                 user=current_app.config['WEBHDFS_USER'])
def __init__(self, from_directory, images_path, to_directory):
    self.from_directory = from_directory
    self.to_directory = to_directory
    self.images_path = images_path
    self.hdfs_client = InsecureClient('http://192.168.1.4:9870', user='******')
l = 0
r = 7
length = len(df)
while l < length:
    pd.io.sql.to_sql(df[l:r], 'opt_bi_range', yconnect,
                     schema='ezp-opt', if_exists='append', index=False)
    l += 1
    r += 1

# Write the data into HDFS
from hdfs import InsecureClient

hdfs_client = InsecureClient('http://cluster2-master:50070', user='******')
if len(result.head(1)):
    pandas_df = result
    pandas_df.to_csv('/home/jifenduiquan.csv')
    dir_path = '/tmp'
    file_list = hdfs_client.list(dir_path)
    target_file = 'jifenduiquan.csv'
    append = True if target_file in file_list else False
    try:
        with hdfs_client.write("{}/{}".format(dir_path, target_file),
                               append=append, encoding='utf-8') as writer:
            pandas_df.to_csv(writer, header=True)
    except:
        pass
def uploadHdfs(client, localPath):
    hdfsPath = localPath.replace("D://hdfs/", "/")
    # client.delete(hdfsPath)
    client.upload(hdfsPath, localPath, overwrite=True)


if __name__ == '__main__':
    cf = configparser.ConfigParser()
    cf.read("config.ini")
    # hdfs = cf.get("hdfs", "hdfs_url")
    hdfs = cf.get("hdfs_test", "hdfs_url")
    client = InsecureClient(hdfs, user='******')
    # client = InsecureClient(hdfs, user='******')
    # client = InsecureClient(hdfs, user='******')

    # fl = client.list("/user/dev/euler/inneralgoframefile")
    # fl = client.list("/user/dev/boss/customerfile/2018/useralgorithm/")
    # fl = client.list("/user/dev/boss/customerfile/11/")
    # fl = client.list("/user/dev/boss/customerfile/56/324555663560933382/euler/file/transform/")
    # fl = client.list("/user/dev/boss/customerfile/2018/dataset")
    # fl = client.list("/user/dev/boss/customerfile/11/10/euler/trainView/319356072888631296/")
    fl = client.list("/user/dev/boss/customerfile")
    # fl = client.list("/user/root/euler/inneralgofile")
    # client.makedirs("/launcher/code/")
    print(fl)
    # hdfspath = "/user/dev/boss/customerfile/11/10/euler/trainView/315259954451775488/model/sk.pkl"
def get_hdfs_files(db_name, table_name):
    client = InsecureClient(cons.BASE_URL, user=cons.USER_NAME)
    content = client.walk(cons.HDFS_PATH + db_name + '.db/' + table_name, 1)
    return content
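# A minimal sketch of consuming the generator returned above; client.walk yields
# (path, dirnames, filenames) tuples like os.walk. The database and table names
# below are placeholders.
for dirpath, dirnames, filenames in get_hdfs_files('sales_db', 'orders'):
    for name in filenames:
        print(dirpath + '/' + name)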
path = '/home/bda/data'
logging.info("Write JSON and CSV to : " + path)

with open(path + '/solarlog_' + str(site_id) + '_' + epoch_time_now + '.json', 'w', encoding='utf-8') as outfile:
    json.dump(solar_data, outfile, indent=4, ensure_ascii=False)

# write the same data as .csv since it is easier to handle with hdfs
with open(path + '/solarlog_' + str(site_id) + '_' + epoch_time_now + '.csv', 'w') as f:
    w = csv.DictWriter(f, solar_data.keys(), dialect=csv.excel_tab)
    w.writeheader()
    w.writerow(solar_data)

# write data to hdfs
logging.info("Write csv to hdfs : /data/solarlog/")
client = InsecureClient('http://nh-01.ip-plus.net:50070', user='******')
with client.write('/data/solarlog/solarlog_' + str(site_id) + '_' + epoch_time_now + '.csv',
                  encoding='utf-8') as writer:
    w = csv.DictWriter(writer, solar_data.keys(), dialect=csv.excel_tab)
    w.writeheader()
    w.writerow(solar_data)

# Write to KAFKA
kafka_produce(solar_data)
import os
import sys
import warnings
import posixpath as psp

from tensorflow.contrib import learn
from tensorflow.contrib.learn.python import SKCompat as skflow
from sklearn.metrics import mean_squared_error
from hdfs3 import HDFileSystem

from lstm_predictor2 import lstm_model, load_csvdata, rnn_model, gru_model

filename = "ptp_resultsRNNall.txt"
results = open(filename, 'a')

warnings.filterwarnings("ignore")

working_dir = "/user/canast02/ystr=2016"

from hdfs import InsecureClient

hdfs = InsecureClient(url='http://pythia1.in.cs.ucy.ac.cy:50070', user='******')
fnames = hdfs.list(working_dir)
print(fnames)

LOG_DIR = 'resources/logs/'
TIMESTEPS = 1
RNN_LAYERS = [{'num_units': 16}, {'num_units': 16}]
DENSE_LAYERS = []
TRAINING_STEPS = 1000
PRINT_STEPS = TRAINING_STEPS  # / 10
BATCH_SIZE = 64

regressor = skflow(
    learn.Estimator(model_fn=rnn_model(TIMESTEPS, RNN_LAYERS, DENSE_LAYERS)))
# model_dir=LOG_DIR)
from hdfs import InsecureClient
import datetime

client = InsecureClient('http://localhost:14000')


def exists(path):
    return client.status(path, strict=False) is not None


def mkdir(path):
    print('MKDIR: ' + path)
    client.makedirs(path)


def touch(path):
    print('TOUCH: ' + path)
    with client.write(path, encoding='utf-8') as writer:
        writer.write('')


def append(path, data):
    print('APPEND: ' + path)
    with client.write(path, append=True, encoding='utf-8') as writer:
        writer.write(data + '\n')


def write(path, data):
    print('WRITE: ' + path)
    with client.write(path, encoding='utf-8') as writer:
        writer.write(data + '\n')
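# A small usage sketch of the helpers above; the paths and log contents are placeholders.
if __name__ == '__main__':
    base = '/tmp/hdfs_helpers_demo'
    if not exists(base):
        mkdir(base)
    touch(base + '/marker')
    write(base + '/events.log', 'first line')
    append(base + '/events.log', 'second line')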
def _webhdfs(self):
    from hdfs import InsecureClient

    client = InsecureClient(f"http://{self.host}:{self.port}", self.user)
    yield client
def find_result(image_path, image_name):
    result = []  # object detection result
    with open(image_path, 'rb') as f:
        data = f.read()  # read the input image
    # create an insecure channel to the host, port number 4001
    channel = implementations.insecure_channel('0.0.0.0', 4001)
    # create the prediction service, which gives access to the models loaded in the model server
    stub = prediction_service_pb2.beta_create_PredictionService_stub(channel)
    request = predict_pb2.PredictRequest()  # request to send to the server
    # PredictRequest specifies which TensorFlow model to run, which tensors are the inputs and how
    # the outputs are filtered before being returned to the user
    request.model_spec.name = 'coco_model'
    request.model_spec.signature_name = 'detection_signature'  # model signature
    request.inputs['inputs'].CopyFrom(tf.contrib.util.make_tensor_proto(data, shape=[1]))  # input tensors
    res = stub.Predict(request, 10.0)  # result of the prediction request, 10 secs timeout
    scores = res.outputs['detection_scores'].float_val  # scores of the detected objects, in decreasing order
    classes = res.outputs['detection_classes'].float_val  # ids of the detected classes, ordered by decreasing score
    # print zip(classes, scores)
    # vector with the normalized positions of the image's bounding boxes: ymin, xmin, ymax, xmax
    # the bounding boxes are ordered starting from the object with the highest score
    boxes = res.outputs['detection_boxes'].float_val
    # reshape the vector so that each element is a 4-tuple identifying a bounding box
    boxes = np.reshape(boxes, [100, 4])
    # to save the image with the bounding boxes, open the image and use TensorFlow's vis_util library
    im = imageio.imread(image_path)  # read the image as a multidimensional array
    label_map_path = "Label_maps/mscoco_label_map.pbtxt"
    label_map = label_map_util.load_labelmap(label_map_path)  # id-label map
    categories = label_map_util.convert_label_map_to_categories(label_map=label_map, max_num_classes=90)  # list of dicts
    category_index = label_map_util.create_category_index(categories)  # dict of ("id", "class name") pairs
    # build an array (img_height, img_width, 3) with the bounding boxes overlaid
    image_vis = vis_util.visualize_boxes_and_labels_on_image_array(
        im,
        boxes,
        np.squeeze(classes).astype(np.int32),
        np.squeeze(scores),
        category_index,
        max_boxes_to_draw=10,   # maximum number of bounding boxes to display
        min_score_thresh=.2,    # minimum score threshold for a box to be displayed
        use_normalized_coordinates=True,
        line_thickness=5)       # line width of the box outlines
    imageio.imwrite("Images_bbx/{}_coco.jpg".format(image_name), image_vis)  # save the array as a JPEG image

    client_hdfs = InsecureClient('http://localhost:50070')  # client to access HDFS
    # use the socket module to check whether the HDFS port is reachable
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)  # create a new socket
    port_result = sock.connect_ex(('localhost', 50070))  # returns 0 if the port is open, non-zero otherwise
    if port_result == 0:
        # if HDFS is reachable, move the image with the predicted bounding boxes there
        client_hdfs.upload('/zora-object-detection/images/{}_coco.jpg'.format(image_name),
                           "Images_bbx/{}_coco.jpg".format(image_name))
        os.remove("Images_bbx/{}_coco.jpg".format(image_name))

    vowels = ("a", "e", "i", "o", "u")
    bbx_coco = []  # coordinates of the bounding boxes found with the COCO model
    # if the score of the highest-scoring object in the image is at least 0.6,
    # run the detection and build the string to send to the robot
    if (scores[0] >= 0.6):
        labels_coco = []    # labels found with the COCO model
        labels_pets = []    # labels found with the Pets model
        labels_people = []  # labels found with the People model
        bbx_pets = []       # bounding box coordinates from the Pets model
        bbx_people = []     # bounding box coordinates from the People model
        boxes = boxes.tolist()  # turn the multidimensional array into a list
        j = 0
        for i in range(0, len(classes)):
            # keep every label with a score >= 0.6 whose bounding box does not match one
            # already inserted with a higher score
            if (scores[i] >= 0.6 and boxes[i] not in bbx_coco):
                labels_coco.append(str(category_index[int(classes[i])]['name']))
                bbx_coco.append(boxes[i])
                j = j + 1
        # if the labels include people, look for their gender with an object detection pass using the People model
        if ("person" in labels_coco):
            n_people = labels_coco.count("person")
            labels_people, bbx_people = pets_people_detection.find_labels(image_path, image_name, stub,
                                                                          request, "people_model", n_people)
        # if the labels include cats or dogs, look for the breeds with an object detection pass using the Pets model
        if ("cat" in labels_coco or "dog" in labels_coco):
            n_cat = labels_coco.count("cat")  # number of cats
            n_dog = labels_coco.count("dog")  # number of dogs
            n_pets = n_cat + n_dog
            # labels and bounding boxes found with the Pets model
            labels_pets, bbx_pets = pets_people_detection.find_labels(image_path, image_name, stub,
                                                                      request, "pets_model", n_pets)
        # dictionary whose keys are the label names and whose values are their occurrences in labels_coco
        counter = Counter(labels_coco)
        counter = list(counter.items())  # convert the dictionary into a list of (label name, count) pairs

        # build the string "string" to send to the robot
        string = "I see "
        for i in range(0, len(counter)):  # the length of counter is the number of distinct labels
            if counter[i][1] == 1:  # the class has a single occurrence
                if (counter[i][0].startswith(vowels)):  # the class name starts with a vowel
                    string += "an " + counter[i][0] + ", "
                else:
                    string += "a " + counter[i][0] + ", "
            else:  # the class has several occurrences
                s = ["sheep", "scissors", "skis"]
                es = ["sandwich", "toothbrush", "wine glass", "bus", "bench", "couch"]
                if (counter[i][0] in s):
                    string += str(counter[i][1]) + " " + counter[i][0] + ", "
                elif (counter[i][0] in es):
                    string += str(counter[i][1]) + " " + counter[i][0] + "es, "
                elif (counter[i][0] == "knife"):
                    string += str(counter[i][1]) + " knives, "
                elif (counter[i][0] == "person"):
                    string += str(counter[i][1]) + " people, "
                else:
                    string += str(counter[i][1]) + " " + counter[i][0] + "s, "
        string = string.rstrip(", ")  # if the sentence ends with a comma, drop it
        if ("," in string):
            k = string.rfind(",")
            string = string[:k] + " and" + string[k + 1:]  # replace the last comma of the sentence with "and"
        string += "!"
        if (labels_people != []):  # the gender of the person/people was detected
            n_man = labels_people.count("man")
            n_woman = labels_people.count("woman")
            if len(labels_people) == 1 and n_people == 1:  # there is one person in the image
                if (n_man == 1):
                    string += " He is a man"
                else:
                    string += " She is a woman"
            elif len(labels_people) == 1 and n_people > 1:
                # there are several people in the image but the gender of only one was recognized
                string += " A person is a " + labels_people[0]
            else:  # the gender of several people was recognized
                if n_man > 1 and n_woman == 0:
                    string += " There are " + str(n_man) + " men"
                elif n_man == 0 and n_woman > 1:
                    string += " There are " + str(n_woman) + " women"
                elif n_man == 1 and n_woman == 1:
                    string += " There is a man and a woman"
                elif n_man > 1 and n_woman == 1:
                    string += " There is a woman and " + str(n_man) + " men"
                elif n_man == 1 and n_woman > 1:
                    string += " There is a man and " + str(n_woman) + " women"
                else:
                    string += " There are " + str(n_man) + " men and " + str(n_woman) + " women"
            string += "."

        if (labels_pets != []):  # the breeds of the dogs and/or cats were detected
            cat_breeds = []
            dog_breeds = []
            for i in range(0, len(labels_pets)):
                # split the breeds into two lists based on the first letter of the name:
                # uppercase for a cat breed, lowercase for a dog breed
                if labels_pets[i][0].isupper():
                    cat_breeds.append(labels_pets[i])
                else:
                    dog_breeds.append(labels_pets[i])
            if (n_cat == 1 and n_dog == 0):  # there is one cat and no dogs in the image
                pet_string = " The cat breed is " + labels_pets[0]
            elif (n_cat == 0 and n_dog == 1):  # there is one dog and no cats in the image
                pet_string = " The dog breed is " + labels_pets[0]
            elif (n_cat > 1 and n_dog == 0):  # there are several cats and no dogs in the image
                if (len(labels_pets) == 1):  # the breed of only one of the cats was found
                    pet_string = " The breed of a cat is " + labels_pets[0]
                else:  # the breeds of the cats were found
                    pet_string = " The cat breeds are "
                    for i in range(0, len(labels_pets)):
                        pet_string += labels_pets[i] + ", "
            elif (n_cat == 0 and n_dog > 1):  # there are several dogs and no cats in the image
                if (len(labels_pets) == 1):  # the breed of only one dog was found
                    pet_string = " The breed of a dog is " + labels_pets[0]
                else:  # the breeds of the dogs were found
                    pet_string = " The dog breeds are "
                    for i in range(0, len(labels_pets)):
                        pet_string += labels_pets[i] + ", "
            else:  # the image contains both dogs and cats
                if (len(labels_pets) == 1):  # the Pets model found only one breed name
                    if not dog_breeds:  # there are no dog breeds
                        if n_cat == 1:
                            pet_string = " The cat breed is " + labels_pets[0]
                        else:
                            pet_string = " The breed of a cat is " + labels_pets[0]
                    else:  # there are no cat breeds
                        if n_dog == 1:
                            pet_string = " The dog breed is " + labels_pets[0]
                        else:
                            pet_string = " The breed of a dog is " + labels_pets[0]
                else:  # several breeds were found
                    if not cat_breeds and n_dog > 1:  # no cat breeds and several dogs in the image
                        pet_string = " The dog breeds are "
                    elif not dog_breeds and n_cat > 1:  # no dog breeds and several cats
                        pet_string = " The cat breeds are "
                    else:  # there are both dog and cat breeds
                        pet_string = " The dog and cat breeds are "
                    for i in range(0, len(labels_pets)):
                        pet_string += labels_pets[i] + ", "
            pet_string = pet_string.rstrip(", ")
            if ("," in pet_string):  # replace the last comma of the string with "and"
                k = pet_string.rfind(",")
                pet_string = pet_string[:k] + " and" + pet_string[k + 1:]
            pet_string += "."
            string += pet_string
        result = [string]
        log_string = string + '\n'  # string to write to the log file saved in HDFS
        # also consider the highest-scoring object with a score between 0.5 and 0.6,
        # if it does not coincide with an object already considered
        if (scores[j] < 0.6 and scores[j] > 0.5 and boxes[j] not in bbx_coco):
            class_name = str(category_index[int(classes[j])]['name'])
            bbx_coco.append(boxes[j])
            if (class_name in labels_coco):
                result.append("Is there also another " + class_name + "?")
                log_string += "Maybe there is also another " + class_name + ".\n"
            else:
                if class_name.startswith(vowels):
                    result.append("Is there also an " + class_name + "?")
                    log_string += "Maybe there is also an " + class_name + ".\n"
                else:
                    result.append("Is there also a " + class_name + "?")
                    log_string += "Maybe there is also a " + class_name + ".\n"
        log_string = add_bbx_log(log_string, bbx_coco, '{}_coco.jpg'.format(image_name))
        if (labels_people != []):
            log_string = add_bbx_log(log_string, bbx_people, '{}_people.jpg'.format(image_name))
        if (labels_pets != []):
            log_string = add_bbx_log(log_string, bbx_pets, '{}_pets.jpg'.format(image_name))
    # if the highest-scoring label has a score between 0.2 and 0.6, collect every label
    # whose score falls between those thresholds
    elif (scores[0] < 0.6 and scores[0] > 0.2):
        # string to write to the log file with guesses about objects that might be in the image
        log_string = "I'm not sure what's in the picture.\nMaybe there is"
        for i in range(0, len(classes)):
            if (scores[i] < 0.6 and scores[i] > 0.2):
                class_name = str(category_index[int(classes[i])]['name'])
                if (class_name not in result):
                    result.append(class_name)
                    bbx_coco.append(boxes[i])
                    if class_name.startswith(vowels):
                        log_string += " an " + class_name + ","
                    else:
                        log_string += " a " + class_name + ","
        log_string = log_string.rstrip(", ")
        if ("," in log_string):  # replace the last comma of the string with "or"
            k = log_string.rfind(",")
            log_string = log_string[:k] + " or" + log_string[k + 1:]
        log_string += ".\n"
        log_string = add_bbx_log(log_string, bbx_coco, '{}_coco.jpg'.format(image_name))
    # if the highest-scoring label has a score <= 0.2, return the empty "result" vector to the robot
    else:
        log_string = "I don't know what's in the picture!"

    if port_result == 0:
        # if HDFS is reachable, create the log file with the string
        with client_hdfs.write('/zora-object-detection/logs/{}.log'.format(image_name)) as writer:
            writer.write(log_string)
    return result
from rpy2 import robjects
from git_clone import git_clone
from hdfs import InsecureClient
import shutil

# Next Round
print("Hello again")

# Check if there is data for a prediction
client_hdfs = InsecureClient('http://awscdh6-ma.sap.local:9870', user='******')
hdfs_content = client_hdfs.list('/tmp/tbr/BARMER/XSA')
print(hdfs_content)
print()

if len(hdfs_content) > 0 and hdfs_content[0] == 'iris.csv':
    print('Starting prediction')

    # Source of the R script
    source_path = 'https://github.com/JimKnopfSun/BARMER_XSA.git'
    # Destination of the R script on XSA
    target_path = '/usr/sap/HN2/home/testdir/'

    # Remove old script downloads on XSA
    shutil.rmtree(path=target_path + "/BARMER_XSA", ignore_errors=True, onerror=None)

    # Clone the R script to XSA
    git_clone(source_path, target_path)
import pandas as pd
from io import StringIO
from hdfs import InsecureClient, HdfsError

client = InsecureClient('http://datalake:50070')


def read_file(msg):
    attr = dict()
    api.send("outData", api.Message(attributes=attr, body='Param Loading Triggered'))
    sdl_file = '/shared/SLT/SFLIGHT/data.csv'
    with client.read(sdl_file, encoding='utf-8') as reader:
        df = pd.read_csv(reader)
    api.send("outData", api.Message(attributes=attr, body=str(df_key)))
    api.send("out2", str(df_key))


api.set_port_callback("input", read_file)
# export feature class to hdfs
import arcpy
import pandas as pd
from hdfs import InsecureClient
import os

arcpy.env.workspace = r'C:\folder\data.gdb'
fc = r'Sample'
field_names = [f.name for f in arcpy.ListFields(fc)]
Vars = ['OBJECTID', 'Param1', 'Param2', 'Param3']
spatRef = arcpy.Describe(fc).spatialReference
data = arcpy.da.FeatureClassToNumPyArray(fc, ["SHAPE@X", "SHAPE@Y"] + Vars)
df = pd.DataFrame(data)

client_hdfs = InsecureClient('http://127.0.0.1:50070')
with client_hdfs.write('/user/root/folder/data.csv', encoding='utf-8') as writer:
    df.to_csv(writer)
import os
import time

from hdfs.client import Client
from hdfs import InsecureClient
# RootPath = 'D:\\aaa'
from hdfs3 import HDFileSystem

# hdfs = HDFileSystem("http://192.168.1.95", port=50070)
# print(hdfs.ls('/user/data'))

RootPath = '/home/qinyu/myproject/python_project/examination_pro/hdfscontroll/inventory'
HdfsRootPath = '/qjb'
LogPath = '/home/qinyu/myproject/python_project/examination_pro/hdfscontroll/log/'

# client = Client("http://192.168.1.95:50070", timeout=1000)
client = InsecureClient(url="http://192.168.1.95:50070", user="******")


def savelog(logPath, log):
    if not os.path.exists(logPath):
        os.makedirs(logPath)
    stamp = int(time.time())
    logName = time.strftime("%Y%m%d_%H%M", time.localtime(stamp)) + '.log'
    logFilePath = os.path.join(logPath, logName)
    file = open(logFilePath, 'w')
    file.write(log + '\n')
    file.close()

# for i in client.list('/'):
#     print(i)
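# A hedged follow-up sketch: pushing the log directory written by savelog() into HDFS under
# HdfsRootPath. client.upload accepts directories as well as files; the target layout and the
# log message below are assumptions, not part of the original code.
savelog(LogPath, 'inventory sync finished')
client.upload(HdfsRootPath + '/log', LogPath, overwrite=True)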
            if args.json:
                json_writer.writeJSONPSMSfromArchive(archivePath, jsonPath)
        else:
            log.warning("No files downloaded!")
    except Exception as err:
        log.error("Exception {}".format(err))
        error = err

    if csvs:
        log.info("CSVs generated during execution!")
        hdfs_parent = os.path.join(os.sep, os.environ['HDFS_USER'], str(row.csv_generator_job_id))
        for csv in csvs:
            hdfs_path = os.path.join(hdfs_parent, csv.split(os.sep)[-1])
            # put csv into hdfs
            # put = Popen(["hadoop", "fs", "-put", csv, hdfs_path], stdin=PIPE, bufsize=-1)
            # put.communicate()
            client = InsecureClient('{}:{}'.format(os.environ['HDFS_HOST'], os.environ['HDFS_PORT']),
                                    user=os.environ['HDFS_USER'])
            client.upload(hdfs_path, csv)
        session.execute("UPDATE CSV_GENERATOR_JOBS SET STATUS = 1, CSV_HDFS_PATH={}, PRIDE_ID={}, JOB_RESULT_MESSAGE=\'{}\' WHERE CSV_GENERATOR_JOB_ID={} IF EXISTS;".format(hdfs_parent, projects, "success", str(row.csv_generator_job_id)))
    else:
        log.warning("No CSVs generated during execution!")
        session.execute("UPDATE CSV_GENERATOR_JOBS SET STATUS = -1, JOB_RESULT_MESSAGE=\'{}\' WHERE CSV_GENERATOR_JOB_ID={} IF EXISTS;".format(str(error), str(row.csv_generator_job_id)))

    log.info('Sleeping for {} seconds!'.format(os.environ['TIMEOUT']))
    time.sleep(int(os.environ['TIMEOUT']))
import os

import pyspark.sql.types as t
from hdfs import InsecureClient
from pyspark import SparkContext
from pyspark.sql import SparkSession

from schemas.omdb import schema_actors
from udfs import (
    general_awards_by_keyword,
    nominated_by_keyword,
    omdb_data,
    won_by_keyword,
)

os.environ["HADOOP_USER_NAME"] = HADOOP_USER_NAME
client_hdfs = InsecureClient(f"http://{HADOOP_NAMENODE}:50070", user=HADOOP_USER_NAME)

# get preprocessed opusdata filename
hdfs_path = "/processed/opusdata.csv"
filename = [f for f in client_hdfs.list(hdfs_path) if f.endswith(".csv")][0]

sc = SparkContext(SPARK_URI)
parent_dir = os.path.dirname(os.path.abspath(__file__))
sc.addPyFile(os.path.join(parent_dir, "utils.py"))

sparkSession = (
    SparkSession.builder.appName("preprocessing-opusdata-and-omdb")
    .config("spark.hadoop.dfs.client.use.datanode.hostname", "true")
    .getOrCreate())
def __init__(self, bucket_name: str, folder_name: str):
    super().__init__(bucket_name, folder_name)
    self.client = InsecureClient(url=settings.HDFS_CONN, user=settings.HDFS_USERNAME)
def remove_in_hdfs(hdfs_path):
    client = InsecureClient('http://quickstart.cloudera:50070', user='******')
    client.delete(hdfs_path, recursive=True)
from hdfs import InsecureClient

hdfs_cli = InsecureClient('http://192.168.1.4:9870', user='******')

hdfs_cli.delete('/images', recursive=True)
hdfs_cli.delete('/images_augmented', recursive=True)
hdfs_cli.delete('/images_crop', recursive=True)
hdfs_cli.delete('/images_norm', recursive=True)
hdfs_cli.delete('/image_test', recursive=True)
hdfs_cli.delete('/image_test_crop', recursive=True)
hdfs_cli.delete('/image_test_ready', recursive=True)
hdfs_cli.delete('/algo_trained', recursive=True)
def get_hdfs_client():
    return InsecureClient("http://192.168.2.109:50070", user="******", root="/")
def __init__(self, namenode="localhost", port="50070"):
    self.WEB_HDFS_URL = 'http://' + namenode + ':' + str(port)
    print(namenode, ">>", port, ">>", self.WEB_HDFS_URL)
    self.client = InsecureClient(self.WEB_HDFS_URL)
def hdfs_client():
    # TODO: Configure from env
    return InsecureClient('http://%s:50070' % master_ip(), user=config.HDFS_USER)
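# A minimal sketch of the TODO above: reading the namenode host, port, and user from the
# environment instead of master_ip() and config. The environment variable names and defaults
# are assumptions, not part of the original code.
import os
from hdfs import InsecureClient


def hdfs_client_from_env():
    host = os.environ.get('HDFS_NAMENODE_HOST', 'localhost')
    port = os.environ.get('HDFS_NAMENODE_PORT', '50070')
    user = os.environ.get('HDFS_USER', 'hdfs')
    return InsecureClient('http://%s:%s' % (host, port), user=user)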
def get_hdfs_client():
    return InsecureClient(get_hdfs_address(), root='/')
def stream(string, lines, t):
    """
    Stream tweets from twitter and save them to file every hour
    Args:
        string - search query string
        lines - array of streaming words
        t - Twarc class
    Returns:
        boolean - True (OK) / False (Error)
    """
    words = lines
    hour_keywords = {}

    # make timestamps
    timestr = time.strftime("%Y-%m-%d_%H-%M-%S")
    datestr = time.strftime("%Y-%m-%d")

    # get total time for check time
    start_time = time.time()

    # create directories and files for keywords
    tweets_to_write = {}
    indexes = {}
    client = InsecureClient('http://192.168.1.12:50070', user='******')
    for word in words:
        dir_word = word.replace(" ", "_")
        # for statistics
        if not os.path.isdir("data/statistics"):
            os.makedirs("data/statistics")
        # for statistics date
        if not os.path.isdir("data/statistics/" + datestr):
            os.makedirs("data/statistics/" + datestr)
        # for keyword
        if not os.path.isdir("data/" + dir_word):
            os.makedirs("data/" + dir_word)
        # for date
        if not os.path.isdir("data/" + dir_word + "/" + datestr):
            os.makedirs("data/" + dir_word + "/" + datestr)
        # create json file for writing data
        with open("data/" + dir_word + "/" + datestr + "/" + timestr + ".json", "w") as fw:
            fw.write("[")
        tweets_to_write[dir_word] = []
        indexes[dir_word] = 0

    minutes = 1
    while True:
        try:
            # find lines in twitter
            print("String query: %s" % string)
            for tweet in t.stream(string):
                # regex to find keyword
                for word in words:
                    dir_word = word.replace(" ", "_")
                    filename = "data/" + dir_word + "/" + datestr + "/" + timestr
                    # create list of words in keyword
                    wlist = word.split()
                    # length of this list
                    w_length = len(wlist)
                    check = 0
                    # for every word in keyword
                    for w in wlist:
                        # check if word is in tweet
                        keyword = re.search("%s" % w, tweet["text"], re.IGNORECASE)
                        if keyword:
                            check += 1
                    # if every word from keyword is in tweet, save to file
                    if check == w_length:
                        print("Tweet language: %s" % tweet['lang'])
                        if tweet['lang'] in languages:
                            dumped_json = json.dumps(tweet)
                            tweets_to_write[dir_word].append(dumped_json)
                            with open(filename + ".json", "a") as fw:
                                fw.write(dumped_json)
                                fw.write(",")
                            # counting total
                            if word in total_keywords:
                                total_keywords[word] += 1
                            else:
                                total_keywords[word] = 1
                            # counting hourly
                            if word in hour_keywords:
                                hour_keywords[word] += 1
                            else:
                                hour_keywords[word] = 1
                            if len(tweets_to_write[dir_word]) % 10 == 0:
                                print("Going to write into %s_%s" % (filename, indexes[dir_word]))
                                with client.write(filename + "_" + str(indexes[dir_word]),
                                                  encoding='utf-8') as writer:
                                    writer.write("\n".join(tweets_to_write[dir_word]))
                                indexes[dir_word] = indexes[dir_word] + 1
                                tweets_to_write[dir_word] = []
                # exit every hour and start function again
                if start_time + 3600 < time.time():
                    for word in words:
                        dir_word = word.replace(" ", "_")
                        with open("data/" + dir_word + "/" + datestr + "/" + timestr + ".json", "a+") as fw:
                            fw.seek(-1, os.SEEK_END)
                            if fw.read() == ",":
                                fw.seek(-1, os.SEEK_END)
                                fw.truncate()
                            fw.write("]")
                    # hour statistics
                    with open("data/statistics" + "/" + datestr + "/" + timestr + ".txt", "w") as fw:
                        for word in hour_keywords:
                            fw.write(str(word) + " : " + str(hour_keywords[word]) + "\n")
                    # total statistics
                    with open("data/statistics/statistics.txt", "w") as fw:
                        for word in total_keywords:
                            fw.write(str(word) + " : " + str(total_keywords[word]) + "\n")
                    return True
        # except for quit application
        except KeyboardInterrupt:
            for word in words:
                dir_word = word.replace(" ", "_")
                with open("data/" + dir_word + "/" + datestr + "/" + timestr + ".json", "a+") as fw:
                    fw.seek(-1, os.SEEK_END)
                    if fw.read() == ",":
                        fw.seek(-1, os.SEEK_END)
                        fw.truncate()
                    fw.write("]")
            # hour statistics
            with open("data/statistics" + "/" + datestr + "/" + timestr + ".txt", "w") as fw:
                for word in hour_keywords:
                    fw.write(str(word) + " : " + str(hour_keywords[word]) + "\n")
            # total statistics
            with open("data/statistics/statistics.txt", "w") as fw:
                for word in total_keywords:
                    fw.write(str(word) + " : " + str(total_keywords[word]) + "\n")
            sys.stdout.write("QUIT\n")
            sys.exit(0)
        # except for problems with key
        except KeyError:
            # exit every hour and start function again
            if start_time + 3600 < time.time():
                for word in words:
                    dir_word = word.replace(" ", "_")
                    with open("data/" + dir_word + "/" + datestr + "/" + timestr + ".json", "a+") as fw:
                        fw.seek(-1, os.SEEK_END)
                        if fw.read() == ",":
                            fw.seek(-1, os.SEEK_END)
                            fw.truncate()
                        fw.write("]")
                # hour statistics
                with open("data/statistics" + "/" + datestr + "/" + timestr + ".txt", "w") as fw:
                    for word in hour_keywords:
                        fw.write(str(word) + " : " + str(hour_keywords[word]) + "\n")
                # total statistics
                with open("data/statistics/statistics.txt", "w") as fw:
                    for word in total_keywords:
                        fw.write(str(word) + " : " + str(total_keywords[word]) + "\n")
                return True
            continue

    # error
    return False