import traceback

from hdfs import InsecureClient

# CONST (HDFS connection settings) and log (a configured logger) are assumed
# to be provided elsewhere in the project.


class HdfsWrapper:
    def __init__(self):
        self.client = None

    def connect_hdfs(self):
        self.client = InsecureClient(CONST.HDFS_URL, user=CONST.HDFS_USER)

    def mkdir_hdfs(self, path):
        # only create the directory if it does not exist yet
        if self.client.status(path, strict=False) is None:
            self.client.makedirs(path)

    def list_hdfs(self, path):
        return self.client.list(path)

    def read_hdfs(self, hdfs_path):
        try:
            with self.client.read(hdfs_path) as reader:
                return reader.read()
        except Exception:
            log.error(traceback.format_exc())
            self.connect_hdfs()
            log.error('reconnect hdfs...')

    def write_hdfs(self, hdfs_path, data, overwrite=False):
        try:
            with self.client.write(hdfs_path, overwrite=overwrite) as writer:
                writer.write(data)
            return hdfs_path
        except Exception:
            log.error(traceback.format_exc())
            self.connect_hdfs()
            log.error('reconnect hdfs...')

    def delete_hdfs(self, hdfs_path, recursive=False):
        return self.client.delete(hdfs_path, recursive)
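# A minimal usage sketch for the wrapper above. It assumes CONST.HDFS_URL,
# CONST.HDFS_USER and the module-level `log` object exist as in the original
# project; the directory and payload below are made-up placeholders.
if __name__ == '__main__':
    hdfs = HdfsWrapper()
    hdfs.connect_hdfs()
    hdfs.mkdir_hdfs('/tmp/hdfs_wrapper_demo')  # hypothetical directory
    hdfs.write_hdfs('/tmp/hdfs_wrapper_demo/hello.txt', b'hello hdfs', overwrite=True)
    print(hdfs.read_hdfs('/tmp/hdfs_wrapper_demo/hello.txt'))
    print(hdfs.list_hdfs('/tmp/hdfs_wrapper_demo'))
    hdfs.delete_hdfs('/tmp/hdfs_wrapper_demo', recursive=True)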
import json
import os

from hdfs import InsecureClient


def main():
    client = InsecureClient('http://127.0.0.1:50070/', user='******')

    # create directory in HDFS
    client.makedirs('/test')

    # list content
    ll = client.list('/')
    print(ll)

    # create file in HDFS
    data = [{"name": "Anne", "salary": 10000}, {"name": "Victor", "salary": 9500}]
    with client.write('/test/sample_file.json', encoding='utf-8') as json_file_in_hdfs:
        json.dump(data, json_file_in_hdfs)
    # OR
    client.write(os.path.join('/', 'test', 'sample_file2.json'),
                 data=json.dumps(data), encoding='utf-8')

    # download file from HDFS
    client.download('/test/sample_file.json', './file_from_hadoop.json')

    # upload file to HDFS
    client.upload('/test/local_file_in_hadoop.json', './file_from_hadoop.json')
def uploadHDFS(filename):
    # hdfs_path, upload_path and backup_path are module-level settings defined elsewhere.
    # if ' ' in filename:
    #     aoi_file = rename(aoi_file)
    # else:
    aoi_file = filename
    client = InsecureClient('http://10.41.158.65:50070', user='******')
    fname1 = client.list(hdfs_path)

    # if aoi_file.split("@").count('-') >= 2:
    dt = str(aoi_file.split("@")[1].split("-", 1)[0][:8])
    # else:
    #     dt = str(aoi_file.split("_")[-1].split("-")[-2].split("@")[1][:8])
    #     dt = str(aoi_file.split("@")[1].split("-", 1)[0][:8])
    folder1 = dt

    if folder1 in fname1:
        client.upload(hdfs_path + folder1 + "/" + aoi_file,
                      upload_path + aoi_file, overwrite=True)
        print("uploadHDFS ok")
        shutil.move(upload_path + aoi_file, backup_path + aoi_file)
    else:
        client.makedirs(hdfs_path + folder1)
        client.upload(hdfs_path + folder1 + "/" + aoi_file,
                      upload_path + aoi_file, overwrite=True)
        print("uploadHDFS ok")
        shutil.move(upload_path + aoi_file, backup_path + aoi_file)
import json
import os
import time

from hdfs import InsecureClient, util


class DataProcessor:
    def __init__(self, data_path=None):
        if data_path is None:
            self.data_path = r'./config/connect_info.json'
        else:
            assert type(data_path) == str
            self.data_path = data_path
        if not os.path.exists(self.data_path):
            self.data_path = r'./connect_info.json'
        with open(self.data_path) as data_file:
            data = json.load(data_file)
        self.hdfs_client = InsecureClient(
            url='http://' + data['namenode_url'] + ':' + str(data['port']),
            user=data['user'],
            root=data['root_path'])
        self.img_dir = data['img_dir']
        if self.img_dir[-1] != '/':
            self.img_dir += '/'
        self.file_name = 1

    def InitImgDir(self):
        try:
            list_rslt = self.hdfs_client.list(self.img_dir)
            if len(list_rslt) > 0:
                for name in list_rslt:
                    file_path = self.img_dir + name
                    self.hdfs_client.delete(file_path)
        except util.HdfsError:
            self.hdfs_client.makedirs(self.img_dir)
            print("Mkdir ...")
        return True

    def DataProcess(self, data, append=False, file_name=None):
        assert type(data) == str
        if file_name is None:
            file_name = self.img_dir + str(self.file_name)
        else:
            assert type(file_name) == str
        print("start writing...")
        start = time.time()
        self.hdfs_client.write(file_name, data, overwrite=True,
                               replication=1, append=append)
        delta = time.time() - start
        print("writing complete, time delta is " + str(delta))
        return True

    def Upload(self, remote_name, local_path):
        assert os.path.exists(local_path)
        remote_path = self.img_dir + remote_name
        self.hdfs_client.upload(remote_path, local_path, True)
        return True
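# A usage sketch for DataProcessor, assuming a connect_info.json shaped like the
# keys the constructor reads (namenode_url, port, user, root_path, img_dir).
# The concrete values and local file below are placeholders, not taken from the
# original project:
#
# {
#   "namenode_url": "localhost",
#   "port": 50070,
#   "user": "hdfs",
#   "root_path": "/",
#   "img_dir": "/images/"
# }
if __name__ == '__main__':
    processor = DataProcessor('./config/connect_info.json')
    processor.InitImgDir()                    # clear the image dir, or create it
    processor.DataProcess('example payload')  # written as /images/1
    processor.Upload('local_copy.png', './local_copy.png')  # hypothetical local file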
def mkdir_hdfs(self):
    ip_address = self.ip_input.toPlainText()
    port_number = self.port_input.toPlainText()
    user_name = self.user_input.toPlainText()
    dir_name = self.dir_input.toPlainText()
    target_name = dir_name + '/' + self.mkdir_input.toPlainText()
    host_address = 'http://' + ip_address + ':' + port_number
    hadoop = InsecureClient(host_address, user_name)
    hadoop.makedirs(target_name)
class HDFSStorage(Storage):
    def __init__(self, bucket_name: str, folder_name: str):
        super().__init__(bucket_name, folder_name)
        self.client = InsecureClient(url=settings.HDFS_CONN,
                                     user=settings.HDFS_USERNAME)

    def setup(self) -> HDFSResource:
        super().setup()
        self.client.makedirs(f"{self.bucket_name}/{self.folder_name}")
        return HDFSResource(
            resource=f"hdfs:/{self.bucket_name}/{self.folder_name}/")

    def put_file(self, file_path: Union[str, Path],
                 rename: Optional[str] = None) -> HDFSResource:
        if isinstance(file_path, Path):
            file_path = str(file_path)
        file_name = Path(file_path).name if not rename else rename
        # copy file to task directory
        if not file_path.startswith(str(self.local_dir)):
            file_path = shutil.copy(file_path, Path(self.local_dir, file_name))
        try:
            self.client.upload(
                f"{self.bucket_name}/{self.folder_name}/{file_name}", file_path)
        except (gaierror, NewConnectionError):
            raise
        return HDFSResource(
            resource=f"hdfs:/{self.bucket_name}/{self.folder_name}/{file_name}")

    def get_file(self, data_file: str) -> str:
        if not data_file.startswith("hdfs:"):
            raise NotValidScheme(
                "Object file prefix is invalid: expected `hdfs:`")
        _, bucket_name, folder_name, file_name = data_file.split("/")
        file_path = Path(self.temp_dir, bucket_name, folder_name, file_name)
        if not file_path.is_file():
            try:
                self.client.download(data_file, file_path)
            except Exception as err:
                print(err)
        return str(file_path)

    def remove_remote_dir(self, omit_files: List[str] = None) -> None:
        pass
def load_enedis():
    client = InsecureClient('http://localhost:50070', user='******')
    client.makedirs('data')
    print(client.list('/user/cloudera'))
    # load 10 lines
    client.upload(
        '/user/cloudera/data',
        '/home/fitec/projet_fil_rouge/source_des_données/data/consommation_elec_regions_2019_l10.json',
        overwrite=True)
import json
import logging
from datetime import datetime

from hdfs import InsecureClient


def save(inp):
    logging.info('Start saving')
    client = InsecureClient('http://127.0.0.1:50070/', user='******')
    name = inp[0]['date']
    path = f'/bronze/{datetime.now().strftime("%Y-%m-%d")}'
    client.makedirs(path)
    client.write(f'{path}/out_of_stock.json', data=json.dumps(inp))
    logging.info('Saving ok')
import logging
from os.path import basename

from hdfs import InsecureClient
from hdfs.util import HdfsError


class SavedModelUploader(object):
    """Upload a saved model to the Hadoop file system."""

    def __init__(self, url, user, base_path=""):
        self._logger = logging.getLogger(self.__class__.__name__)
        self._url = url
        self._user_ = user
        self._base_path = base_path
        self._client = InsecureClient(url, user)
        if not self._exist(base_path):
            self._mkdir(base_path)

    def _exist(self, path):
        if self._client.content(path, strict=False):
            return True
        else:
            return False

    def _mkdir(self, path):
        self._client.makedirs(path)

    def _del(self, path):
        self._client.delete(path, recursive=True)

    def _upload(self, local_path, hdfs_path):
        self._client.upload(hdfs_path, local_path)

    def _logging_progress(self, local_path, nbytes):
        if nbytes > 0:
            msg = "uploading: '{}' [{} bytes]".format(local_path, nbytes)
        else:
            msg = "uploading: '{}' [done]".format(local_path)
        self._logger.info(msg)

    def upload(self, local_model_path, overwrite=False):
        hdfs_model_path = self._base_path + '/' + basename(local_model_path)
        existed = self._exist(hdfs_model_path)
        if overwrite and existed:
            self._del(hdfs_model_path)
        elif not overwrite and existed:
            raise RuntimeError(
                "could not overwrite the model, already existed.")
        try:
            self._client.upload(self._base_path, local_model_path,
                                progress=self._logging_progress)
        except HdfsError as e:
            self._logger.error(e)
        self._logger.info("model upload done")
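# A usage sketch for SavedModelUploader; the NameNode URL, user, base path and
# local model directory are placeholders, not values from the original project.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    uploader = SavedModelUploader('http://namenode:50070', 'hdfs',
                                  base_path='/models/saved_model')
    uploader.upload('./export/my_model', overwrite=True)  # hypothetical local model dir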
def __init__(self, current_user):
    # The current_user object must be passed in, otherwise an error is raised.
    self.login_name = current_user.login_name
    try:
        # Just test whether we can connect to HDFS.
        c = InsecureClient("http://master:50070", self.login_name)
        # By default, try to create a folder for this user.
        c.makedirs("/lake/usr/" + self.login_name)
    except Exception as error:
        self.is_connection = False
    else:
        self.this_connection = c
def handleHdfsUpload(file_path, proj_id, task_id):
    try:
        client = InsecureClient("http://hdfs.neurolearn.com:50070", user="******")
        hdfs_path = "/neurolearn/files/" + proj_id + "/results/" + task_id
        client.makedirs(hdfs_path)
        client.upload(hdfs_path, file_path)
        print('Uploaded Images to HDFS.')
    except Exception as e:
        print(e)
        hdfs_path = ''
    return hdfs_path
class HdfsDb(object):
    HOST = '192.168.71.156'
    PORT = 50070
    USER = '******'
    HOST_URI = 'http://{0}:{1}'.format(HOST, PORT)

    def __init__(self):
        self.client = InsecureClient(self.HOST_URI, user=self.USER)

    @check_dir_path
    def list_dir(self, dir_path=None):
        """
        List a directory (the root directory by default).
        :return:
        """
        dir_data = self.client.list(dir_path)
        return dir_data

    @check_dir_path
    def mk_dir(self, dir_path=None):
        self.client.makedirs(dir_path)

    def write_file(self, filename, data, dir_path=None):
        """
        Write a file, e.g.
        hd.write_file('test.json', {'name': 'zhexiao'}, dir_path='/data')
        :param filename:
        :param data:
        :param dir_path:
        :return:
        """
        file_path = '{0}/{1}'.format(dir_path, filename)
        self.client.write(file_path, str(data))

    @check_dir_path
    def read_file(self, filename, dir_path=None):
        """
        Read file data, e.g.
        filedata = hd.read_file('README.txt', dir_path='/data')
        :param filename:
        :param dir_path:
        :return:
        """
        file_path = '{0}/{1}'.format(dir_path, filename)
        with self.client.read(file_path, encoding='utf-8') as reader:
            for line in reader:
                yield line

    @check_dir_path
    def delete(self, filename, dir_path=None):
        file_path = '{0}/{1}'.format(dir_path, filename)
        self.client.delete(file_path)
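# A usage sketch for HdfsDb, mirroring the examples in its docstrings. It
# assumes the check_dir_path decorator (defined elsewhere in the project)
# fills in a default dir_path when none is given; the paths are placeholders.
if __name__ == '__main__':
    hd = HdfsDb()
    hd.mk_dir(dir_path='/data')
    hd.write_file('test.json', {'name': 'zhexiao'}, dir_path='/data')
    print(hd.list_dir(dir_path='/'))
    for line in hd.read_file('test.json', dir_path='/data'):
        print(line)
    hd.delete('test.json', dir_path='/data')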
def uploadHDFS(filename):
    aoi_file = filename
    client = InsecureClient('http://10.41.158.65:50070', user='******')
    fname1 = client.list(hdfs_path)
    dt = str(aoi_file.split("@")[1].split("_")[0][:8])
    folder1 = dt
    if folder1 in fname1:
        client.upload(hdfs_path + folder1 + "/" + aoi_file,
                      upload_path + aoi_file, overwrite=True)
        shutil.move(upload_path + aoi_file, backup_path + aoi_file)
    else:
        client.makedirs(hdfs_path + folder1)
        client.upload(hdfs_path + folder1 + "/" + aoi_file,
                      upload_path + aoi_file, overwrite=True)
        shutil.move(upload_path + aoi_file, backup_path + aoi_file)
def copy_table_to_hdfs(**kwargs):
    client = InsecureClient('http://127.0.0.1:50070/', user='******')
    logging.info("Creating dir /bronze on hadoop")
    client.makedirs('/bronze')
    _table_name = kwargs['table_name']
    pg_hook = PostgresHook.get_hook(POSTGRES_CONN_ID)
    with client.write(f'/bronze/{_table_name}.csv') as csv_file:
        logging.info("Exporting table to csv file '%s'", csv_file.name)
        pg_hook.copy_expert(
            f"COPY (SELECT * FROM {_table_name}) TO STDOUT WITH HEADER CSV",
            filename=csv_file)
class DataProcessor:
    def __init__(self, data_path=None):
        if data_path is None:
            self.data_path = r'./config/connect_info.json'
        else:
            assert type(data_path) == str
            self.data_path = data_path
        if not os.path.exists(self.data_path):
            self.data_path = r'./connect_info.json'
        with open(self.data_path) as data_file:
            data = json.load(data_file)
        print("Data: ", data)
        self.hdfs_client = InsecureClient(
            url='http://' + data['namenode_url'] + ':' + str(data['port']),
            user=data['user'],
            root=data['root_path'])
        print("hdfs client: ", self.hdfs_client)
        self.img_dir = data['img_dir']
        print("img dir: ", self.img_dir)
        if self.img_dir[-1] != '/':
            self.img_dir += '/'
        self.file_name = 1

    def InitImgDir(self):
        try:
            list_rslt = self.hdfs_client.list(self.img_dir)
            if len(list_rslt) > 0:
                for name in list_rslt:
                    file_path = self.img_dir + name
                    self.hdfs_client.delete(file_path)
        except util.HdfsError:
            self.hdfs_client.makedirs(self.img_dir)
        return True

    def Upload(self, file_path, threads=2):
        print("FilePath: ", file_path)
        print("img_dir: ", self.img_dir[:-1])
        self.hdfs_client.upload(hdfs_path=self.img_dir[:-1],
                                local_path=file_path,
                                n_threads=threads,
                                overwrite=True)
        return 0
from hdfs import InsecureClient
from hdfs.util import HdfsError


class HDFSService(object):
    def __init__(self):
        self.hdfs = InsecureClient('http://127.0.0.1:9870', user='******')
        self.base_path = '/users/root'

    def mkdir(self, path):
        return self.hdfs.makedirs(path)

    def list(self, path):
        try:
            return self.hdfs.list(path)
        except HdfsError as e:
            print(e)
            return []

    def get(self, path):
        pass

    def upload(self, path, local_path=None, data=None):
        path = self.base_path + path
        if data is not None:
            return self.hdfs.write(path, data=data)
        elif local_path is not None:
            return self.hdfs.upload(path, local_path)
        return False

    def download(self, path):
        path = self.base_path + path
        with self.hdfs.read(path) as reader:
            print(path)
            buf = reader.read()
            print(len(buf))
            return buf
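# A usage sketch for HDFSService; the demo paths are placeholders, and the
# NameNode URL/user are whatever the class hard-codes above. Note that
# upload()/download() are relative to base_path while mkdir()/list() are not.
if __name__ == '__main__':
    svc = HDFSService()
    svc.mkdir('/users/root/demo')
    svc.upload('/demo/hello.txt', data='hello hdfs')  # becomes /users/root/demo/hello.txt
    print(svc.list('/users/root/demo'))
    print(svc.download('/demo/hello.txt'))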
def uploadHDFS():
    # hdfs_path, rec_dat and temp_path are module-level settings defined elsewhere.
    client = InsecureClient('http://10.41.158.65:50070', user='******')
    fname1 = client.list(hdfs_path)
    if rec_dat not in fname1:
        client.makedirs(hdfs_path + "/" + rec_dat)
    src = temp_path + '*'
    backup_path = '/bfdata/buffer/total_pre_arch/'
    dsc = hdfs_path + rec_dat + '/'
    print('hdfs dfs -copyFromLocal ', src, dsc)
    os.system(
        '/home/hadoop/wistron-hadoop/hadoop-2.7.1/bin/hdfs dfs -copyFromLocal '
        + src + ' ' + dsc)
    print('mv -f ', temp_path + "*", backup_path)
    os.system('mv -f ' + temp_path + "* " + backup_path)
    # os.system('/usr/bin/find ' + temp_path + ' -name *.JPG -exec mv {} ' + backup_path + ' \;')
    end_time = time.time()
    com_dat = datetime.datetime.now().strftime('%Y%m%d %H:%M:%S')
def handle_uploaded_file(f, data_id, proj_id):
    file_name = str(f.name)
    with open(file_name, 'wb+') as destination:
        for chunk in f.chunks():
            destination.write(chunk)
    data_content = pd.read_csv(file_name, encoding='utf-8')
    data_json = data_content.to_json()
    try:
        client = InsecureClient("http://hdfs.neurolearn.com:50070", user="******")
        hdfs_path = "/neurolearn/files/" + proj_id + "/datasets/" + data_id
        client.makedirs(hdfs_path)
        client.upload(hdfs_path, file_name)
    except Exception:
        hdfs_path = ''
    return data_json, hdfs_path
def get(self):
    # Retrieve the dataset used for the evaluation
    df = get_data_cassandra()
    print(df.head())
    X = df['total_estimated_load'].values

    # evaluate parameters (p,d,q) <=> (AR, I, MA)
    p_values = 7
    d_values = 0
    q_values = 5
    # best_cfg, best_score = evaluate_models(X, p_values, d_values, q_values)
    best_cfg = (p_values, d_values, q_values)

    # Train the best model
    model = ARIMA(X, order=best_cfg)
    model_fit = model.fit()

    # save model
    if not os.path.exists(model_local_path):
        # Create the local export directory if it does not exist yet
        os.makedirs(model_local_path, exist_ok=False)
    model_fit.save(model_local_path + model_name)

    # Connect to the HDFS client
    client = InsecureClient(url='http://namenode:9870', user='******')

    # Create the storage directory for the processed files
    if client.status(model_hdfs_remote_path, strict=False) is None:
        client.makedirs(model_hdfs_remote_path)

    # Copy the model to HDFS
    remote_load_path = client.upload(model_hdfs_remote_path,
                                     model_local_path + model_name,
                                     overwrite=True)
    # print(remote_load_path)
    print(client.list(model_hdfs_remote_path))

    return {'best_cfg': best_cfg, 'status': 'Terminated'}
def increment_load(tables, cur):
    for table in tables:
        tableName = table
        ts = datetime.datetime.now().strftime('_%Y%m%d_%H%M%S')
        query = ("COPY (SELECT * FROM " + tableName +
                 " where LastModifiedDate>(select run_time from control_table"
                 " where table_name='" + tableName + "')) TO '/tmp/" +
                 tableName + "_CDC" + ts + ".csv'")
        cur.execute(query)

        ssh = paramiko.SSHClient()
        ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        ssh.connect('172.16.6.89', username='******', password='******')
        ftp = ssh.open_sftp()
        ftp.get("/tmp/" + tableName + "_CDC" + ts + ".csv",
                "Gp/" + tableName + "_CDC" + ts + ".csv")
        ftp.close()

        # Connect to Hadoop
        client = InsecureClient('http://172.16.4.144:50070', user='******')
        client.makedirs("/user/root/greenplum/source/" + tableName + "__ct", "0777")
        client.upload(
            "/user/root/greenplum/source/" + tableName + "__ct/",
            "F:/Srilatha/Attunity-POC/Greenplum/Gp/" + tableName + "_CDC" + ts + ".csv")
class WriteWebHDFS(BatchingBolt):
    # Batch up the tuples every 5 seconds
    ticks_between_batches = 5

    def initialize(self, conf, ctx):
        self.conf = conf
        self.ctx = ctx
        # Keep track of how many tuples we have seen
        self.lines = 0
        # Open a connection via webHDFS to hadoop cluster
        # Will need to replace IP address with address of hadoop cluster
        # Also may need to update local /etc/hosts if that remote
        # address returns a hostname that does not resolve to the same
        # address in this file
        self.client = InsecureClient('http://52.91.211.34:50070/', user='******')
        # Create an HDFS directory name based on bolt startup time
        n = datetime.now()
        self.dirname = 'scanrun-' + n.strftime("%Y%m%d%H%M%S")
        self.client.makedirs(self.dirname)

    def process_batch(self, key, tups):
        # Track number of lines and use it in filename to write
        self.lines += len(tups)
        filename = self.dirname + '/'
        filename = filename + str(self.ctx['taskid']).zfill(2)
        filename = filename + str(self.lines).zfill(10)
        filename = filename + '.txt'
        # Write to HDFS
        with self.client.write(filename, encoding='utf-8') as writer:
            for tup in tups:
                writer.write(tup.values[0] + '\n')
        self.log(self.lines)
        self.log(filename)
def full_load(tables, cur):
    for table in tables:
        tableName = table
        ts = datetime.datetime.now().strftime('_%Y%m%d_%H%M%S')
        query = ("COPY (SELECT * FROM " + tableName + ") TO '/tmp/" +
                 tableName + "_FL" + ts + ".csv'")
        cur.execute(query)

        ssh = paramiko.SSHClient()
        ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        ssh.connect('172.16.6.89', username='******', password='******')
        ftp = ssh.open_sftp()
        ftp.get("/tmp/" + tableName + "_FL" + ts + ".csv",
                "Gp/" + tableName + "_FL" + ts + ".csv")
        ftp.close()

        # Connect to Hadoop
        client = InsecureClient('http://172.16.4.144:50070', user='******')
        client.delete("/user/root/greenplum/source/" + tableName, True)
        client.makedirs("/user/root/greenplum/source/" + tableName, "0777")
        client.upload(
            "/user/root/greenplum/source/" + tableName + "/",
            "F:/Srilatha/Attunity-POC/Greenplum/Gp/" + tableName + "_FL" + ts + ".csv")

        sql = "INSERT INTO control_table(table_name) VALUES(%s);"
        cur.execute(sql, (tableName, ))
        connection.commit()
from hdfs import InsecureClient
import os

client = InsecureClient("http://localhost:9870", user='******')
client.delete("streamInput/area", True)
client.makedirs("streamInput/area")
# os.removedirs('file')
def start_service():
    # Download path of the data file
    file_path = "/home/formation/Downloads/hrl_load_estimated.csv"

    # Connect to the HDFS client (the URL and credentials are masked in the
    # source, and the Selenium driver setup that followed is missing there)
    client = InsecureClient(url='http://*****:*****@')

    try:
        element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(
                (By.XPATH, "//input[@default-value='defaultStartDate']")))
        # print(driver.page_source)

        # Enter the start and end dates
        elem = driver.find_element_by_xpath(
            "//input[@default-value='defaultStartDate']")
        elem.clear()
        elem.send_keys("01/01/" + str(year))
        elem = driver.find_element_by_xpath(
            "//input[@default-value='defaultEndDate']")
        elem.clear()
        elem.send_keys("12/31/" + str(year))

        # Wait for the page to reload
        time.sleep(5)
        element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(
                (By.XPATH, "//button[text()='Submit']")))

        # Submit the form to reload the data for the requested period
        elem = driver.find_element_by_xpath("//button[text()='Submit']")
        elem.click()

        # Wait for the export button to appear
        element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "dm-download")))
        elem = driver.find_element_by_class_name("dm-download")
        elem.click()

        # Wait for the file to finish downloading
        while not os.path.exists(file_path):
            time.sleep(1)

        if os.path.isfile(file_path):
            print("Fichier téléchargé pour l'année {}".format(year))
            # Rename the file with the year suffix
            new_file_name = file_path.replace(".csv", "_" + str(year) + ".csv")
            os.rename(file_path, new_file_name)
            # Upload the local CSV file to HDFS
            try:
                remote_load_path = client.upload('/user/root/data/pjm',
                                                 new_file_name, overwrite=True)
                # print(remote_load_path)
            except:
                print("error")
            print(client.list('/user/root/data/pjm'))
        else:
            raise ValueError("%s isn't a file!" % file_path)
    finally:
        # driver.quit()
        print("fin du traitement du fichier")
        # assert "No results found." not in driver.page_source
        driver.close()
        time.sleep(10)
# ==== Writing Dataframe to HDFS =====
with client_hdfs.write('/user/hdfs/wiki/helloworld.csv', encoding='utf-8') as writer:
    df.to_csv(writer)

# ====== Reading files ======
with client_hdfs.read('/user/hdfs/wiki/helloworld.csv', encoding='utf-8') as reader:
    df = pd.read_csv(reader, index_col=0)

# ==== Getting Content Summary ====
client_hdfs.content('hdfs_path')

# ==== Remove a directory or File in HDFS ====
client_hdfs.delete('hdfs_path', recursive=False, skip_trash=True)

# ==== Create a Directory ====
client_hdfs.makedirs('hdfs_path', permission=None)

# ==== Upload File into HDFS ====
client_hdfs.upload('hdfs_path', 'local_path', n_threads=1, temp_dir=None,
                   chunk_size=65536, progress=None, cleanup=True, overwrite=True)

# Source: https://hdfscli.readthedocs.io/en/latest/api.html#module-hdfs.client
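# The calls above assume an already-connected client_hdfs; a minimal sketch of
# creating one and exercising a couple of the listed operations. The NameNode
# host, user and paths below are placeholders, not taken from the original.
from hdfs import InsecureClient
import pandas as pd

client_hdfs = InsecureClient('http://namenode:50070', user='hdfs')  # placeholder URL/user
client_hdfs.makedirs('/user/hdfs/wiki')
with client_hdfs.write('/user/hdfs/wiki/helloworld.csv',
                       encoding='utf-8', overwrite=True) as writer:
    pd.DataFrame({'hello': ['world']}).to_csv(writer)
print(client_hdfs.content('/user/hdfs/wiki/helloworld.csv'))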
class MasterCrawler:
    def __init__(self, url_ckan, redis_ip, redis_port):
        self.ckan = url_ckan
        self.r = redis.StrictRedis(host=redis_ip, port=redis_port, db=0)
        self.client = InsecureClient('http://cdh1:50070/', 'admin',
                                     root='/user/admin/open_data')

    def formatUrl(self, url):
        urlSplit = url.rsplit('/', 1)
        urlEnd = urllib.quote(urlSplit[1])
        urlStart = urlSplit[0]
        finalUrl = urlStart + "/" + urlEnd
        return finalUrl

    def initializeRedis(self):
        content = self.client.content('dati_gov/dati_gov.json', strict=False)
        if not content:
            with self.client.write('dati_gov/dati_gov.json', encoding='utf-8') as writer:
                writer.write('')
        request = urllib2.Request(URL_DATI_GOV + "/api/3/action/package_list")
        response = urllib2.urlopen(request)
        assert response.code == 200
        response_dict = json.loads(response.read())
        # Check the contents of the response.
        assert response_dict['success'] is True
        result = response_dict['result']
        test_res = result  # [:2000]
        for res in test_res:
            print res
            self.r.rpush("dataset_id", res)

    def consumeData(self):
        red = self.r
        while (red.llen("dataset_id") != 0):
            dataset_id = red.lpop("dataset_id")
            encRes = urllib.urlencode(
                {"id": unicode(dataset_id).encode('utf-8')})
            request_info = urllib2.Request(URL_DATI_GOV +
                                           "/api/3/action/package_show?" + encRes)
            # request_info.add_header("Authorization", "Basic %s" % base64string)
            try:
                response_info = urllib2.urlopen(request_info)
                info_dataset = json.loads(response_info.read())
                results = info_dataset['result']
                info = results
                # print json.dumps(info)
                if 'resources' in info:
                    # print info
                    info["m_status_resources"] = "ok"
                    resources = info['resources']
                    name = info['name']
                    idInfo = info['id']
                    for resource in resources:
                        rUrl = resource['url']
                        rFormat = resource['format']
                        rName = resource['name']
                        rId = resource['id']
                        finalUrl = self.formatUrl(rUrl)
                        print finalUrl
                        rInfo = urllib2.Request(finalUrl)
                        try:
                            rReq = urllib2.urlopen(rInfo)
                            if rReq.code == 200:
                                resource["m_status"] = "ok"
                                if "csv" in rFormat.lower():
                                    print "qui passo"
                                    data = rReq.read()
                                    data_dir = "dati_gov/open_api/csv/" + dataset_id
                                    existDir = self.client.content(data_dir, strict=False)
                                    if not existDir:
                                        self.client.makedirs(data_dir)
                                    file_path = data_dir + "/" + rId + ".csv"
                                    # with self.client.write(file_path, encoding='utf-8') as writer:
                                    with self.client.write(file_path) as writer:
                                        writer.write(data)
                                if "json" in rFormat.lower():
                                    data = rReq.read()
                                    data_dir = "dati_gov/open_api/json/" + dataset_id
                                    existDir = self.client.content(data_dir, strict=False)
                                    if not existDir:
                                        self.client.makedirs(data_dir)
                                    file_path = data_dir + "/" + rId + ".json"
                                    # with self.client.write(file_path, encoding='utf-8') as writer:
                                    with self.client.write(file_path) as writer:
                                        writer.write(data)
                            else:
                                resource["m_status"] = "ko"
                        except Exception, e:
                            resource["m_status"] = "ko"
                            print str(e)
                else:
                    print info
                    info["m_status_resources"] = "ko"
                    print "NO RESOURCES"
                with self.client.write('dati_gov/dati_gov.json',
                                       encoding='utf-8', append=True) as writer:
                    writer.write(json.dumps(info) + '\n')
            except Exception, e:
                print str(e)
                red.lpush("dataset_error", dataset_id)
def kafka_hdfs(opticons=None, hdfshost='', broker='', group='', topics=''):
    # optlist and argv are parsed from the command line in __main__ below.
    hdfshost = argv[0]
    broker = argv[1]
    group = argv[2]
    topics = argv[3:]

    # Consumer configuration
    # See https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md
    conf = {
        'bootstrap.servers': broker,
        'group.id': group,
        'session.timeout.ms': 6000,
        'default.topic.config': {
            'auto.offset.reset': 'smallest'
        }
    }

    # Check to see if -T option exists
    for opt in optlist:
        if opt[0] != '-T':
            continue
        try:
            intval = int(opt[1])
        except ValueError:
            sys.stderr.write("Invalid option value for -T: %s\n" % opt[1])
            sys.exit(1)
        if intval <= 0:
            sys.stderr.write(
                "-T option value needs to be larger than zero: %s\n" % opt[1])
            sys.exit(1)
        conf['stats_cb'] = stats_cb
        conf['statistics.interval.ms'] = int(opt[1])

    # Create logger for consumer (logs will be emitted when poll() is called)
    logger = logging.getLogger('consumer')
    logger.setLevel(logging.DEBUG)
    handler = logging.StreamHandler()
    handler.setFormatter(
        logging.Formatter('%(asctime)-15s %(levelname)-8s %(message)s'))
    logger.addHandler(handler)

    # Create Consumer instance
    # Hint: try debug='fetch' to generate some log messages
    c = Consumer(conf, logger=logger)

    def print_assignment(consumer, partitions):
        print('Assignment:', partitions)

    # Subscribe to topics
    c.subscribe(topics, on_assign=print_assignment)

    # hdfs login
    # client = hdfs.Client('http://%s:50070' % (hdfshost))
    client = InsecureClient('http://%s:50070' % (hdfshost), user='******')
    client.makedirs('/kafka')

    # Read messages from Kafka, print to stdout
    try:
        while True:
            logtime = time.strftime('%Y-%m-%d %H:%M:%S',
                                    time.localtime(time.time()))
            getper10 = logtime[15:]
            if getper10 == '0:00':
                hive_load(10, logtime)
            msg = c.poll(timeout=1.0)
            if msg is not None:
                # continue
                if msg.error():
                    # Error or event
                    if msg.error().code() == KafkaError._PARTITION_EOF:
                        # End of partition event
                        sys.stderr.write(
                            '%s %s [%d] reached end at offset %d\n' %
                            (logtime, msg.topic(), msg.partition(), msg.offset()))
                    elif msg.error():
                        # Error
                        raise KafkaException(msg.error())
                else:
                    sys.stderr.write('%s %s [%d] at offset %d with key %s:\n' %
                                     (logtime, msg.topic(), msg.partition(),
                                      msg.offset(), str(msg.value())))
                    msgstr = msg.value().decode('utf-8')
                    # msgstr = msg.value()
                    # json.loads does not preserve the original key order of the
                    # JSON data; to keep the order, use object_pairs_hook
                    msgdict = json.loads(msgstr, object_pairs_hook=OrderedDict)
                    database = msgdict.get('database').encode()
                    table = msgdict.get('table').encode()
                    type = msgdict.get('type').encode()
                    hdfsfile = '%s.%s.%s' % (database, table, type)
                    data = msgdict.get('data')
                    if type == 'insert':
                        datalist = data.values()
                        datastr = ','.join('%s' % id for id in datalist).encode()
                        try:
                            with client.write('/kafka/%s' % (hdfsfile),
                                              append=True,
                                              encoding='utf-8') as writer:
                                writer.write(datastr + '\n')
                                # json.dump(data, writer)
                        except Exception, e:
                            with client.write('/kafka/%s' % (hdfsfile)) as writer:
                                writer.write('')
                    elif type == 'update':
                        with open(hdfsfile, 'a') as writer:
                            json.dump(data, writer)
                    elif type == 'delete':
                        with open(hdfsfile, 'a') as writer:
                            json.dump(data, writer)
                    else:
                        print(type)
            else:
                continue
    except KeyboardInterrupt:
        sys.stderr.write('%% Aborted by user\n')

    # Close down consumer to commit final offsets.
    c.close()


if __name__ == '__main__':
    optlist, argv = getopt.getopt(sys.argv[1:], 'T:')
    if len(argv) < 4:
        print_usage_and_exit(sys.argv[0])
    hdfshost = argv[0]
    client = InsecureClient('http://%s:50070' % (hdfshost), user='******')
    client.makedirs('/kafka')
    kafka_hdfs()
# -*- coding: utf-8 -*-
#
# Copyright © 2018 white <*****@*****.**>
#
# Distributed under terms of the MIT license.
"""
https://hdfscli.readthedocs.io/en/latest/api.html#module-hdfs.client
"""
from hdfs import InsecureClient

hdfs_url = "http://192.168.30.125:50070"
hdfs_user = "******"

c = InsecureClient(hdfs_url, user=hdfs_user)

c.write("/test_write", data="string")
c.delete("/test_write")
c.makedirs("/new/path")  # parent directories are created recursively

with c.read("f.txt", encoding="utf-8") as f:
    content = f.read()

c.write("/test.txt", "test string")
from hdfs import InsecureClient

hdfs_client = InsecureClient("http://master004.diablo.hadoop.nm.ted:50070/",
                             user="******")

# for fn in hdfs_client.list("/user/slave/websac/tiktok/2021-04-12"):
#     print(fn)
#     hdfs_client.delete("/user/slave/websac/tiktok/2021-04-12/" + fn)

hdfs_client.makedirs("/user/slave/websac/tiktok/2021-04-12")
import pandas as pd
from hdfs import InsecureClient

# Cebd1160/Cebd1160!
# emr update dfs.namenode.http-address hadoop conf
client_hdfs = InsecureClient('http://ec2-34-204-70-68.compute-1.amazonaws.com:50070',
                             'hadoop')

# Listing all files in HDFS
fnames = client_hdfs.list('/')
print(fnames)

client_hdfs.makedirs('/test')

# with client_hdfs.write('/test/sample-file.txt') as writer:
#     writer.write('adding one line to a file called sample-file.txt')

# Creating a simple Pandas DataFrame
liste_hello = ['hello1', 'hello2']
liste_world = ['world1', 'world2']
df = pd.DataFrame(data={'hello': liste_hello, 'world': liste_world})

# Writing Dataframe to hdfs
with client_hdfs.write('/test/helloworld.csv', encoding='utf-8') as writer:
    df.to_csv(writer)

# ====== Reading files ======
# with client_hdfs.read('/test/helloworld.csv', encoding='utf-8') as reader:
#     df = pd.read_csv(reader, index_col=0)