import os

from hdfs.ext.kerberos import KerberosClient


def save_file_hdfs(rdd, dir_files_pdf, server_hdfs, user_name_hdfs):
    # rdd is a single record: (file id, technical info, raw PDF bytes)
    n_file_id = int(rdd[0])
    n_info_tec = rdd[1].replace("/", "-")  # sanitize path separators in the file name
    n_file = rdd[2]
    hdfsclient = KerberosClient(server_hdfs)
    hdfsclient.write(
        os.path.join(dir_files_pdf, '{}_{}.pdf'.format(n_file_id, n_info_tec)),
        n_file, overwrite=True)
    return rdd
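For context, a minimal sketch of how a record-level saver like this might be driven from PySpark; the RDD contents, paths and URLs below are assumptions, not part of the original snippet.

# Usage sketch (hypothetical data and HDFS settings; each call opens its own
# KerberosClient, so every executor needs a valid Kerberos ticket).
from pyspark import SparkContext

sc = SparkContext(appName='pdf-to-hdfs')
records = sc.parallelize([(1, 'report/2021', b'%PDF-1.4 ...')])
records.foreach(
    lambda rec: save_file_hdfs(rec, '/user/data/pdfs',
                               'http://namenode.example.com:50070', 'hdfs_user'))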
from io import BytesIO

import matplotlib.pyplot as plt
import numpy as np
from hdfs.ext.kerberos import KerberosClient
from mpl_toolkits.basemap import Basemap
from pyproj import Proj


def transform(outDir, image, x, y, dt):
    # satHeight, satLongitude and satSweep are module-level satellite constants defined elsewhere
    plt.switch_backend('agg')
    plt.figure(figsize=(25, 15), dpi=100)
    # Convert the GOES fixed-grid coordinates back to lon/lat
    p = Proj(proj='geos', h=satHeight, lon_0=satLongitude, sweep=satSweep)
    XX, YY = np.meshgrid(x, y)
    lons, lats = p(XX, YY, inverse=True)
    # Re-project onto a Lambert conformal map centered over the CONUS
    mH = Basemap(resolution='i', projection='lcc', area_thresh=1500,
                 width=1800 * 3000, height=1060 * 3000,
                 lat_1=38.5, lat_2=38.5, lat_0=38.5, lon_0=-97.5)
    xH, yH = mH(lons, lats)
    # Build an RGBA color per grid cell from the image array
    rgb = image[1][:, :-1, :]
    rgb = rgb / 256.0
    colorTuple = rgb.reshape((rgb.shape[0] * rgb.shape[1]), 3)
    colorTuple = np.insert(colorTuple, 3, 1.0, axis=1)
    newmap = mH.pcolormesh(xH, yH, image[1][:, :, 0], color=colorTuple, linewidth=0)
    newmap.set_array(None)
    mH.drawstates()
    mH.drawcountries()
    mH.drawcoastlines()
    # plt.title('GOES-16 Pseudo Color\n%s' % dt.strftime('%B %d, %Y UTC'))
    # Render the figure to an in-memory PNG and write it to HDFS
    buf = BytesIO()
    plt.savefig(buf, format='png', bbox_inches='tight', pad_inches=0)
    buf.seek(0)
    client = KerberosClient('http://hc.gps.stthomas.edu:50070')
    with client.write(outDir + '/TRANSFORM_' + image[0].split("/")[-1], overwrite=True) as writer:
        writer.write(buf.getvalue())
    buf.close()
from io import BytesIO

import matplotlib.pyplot as plt
import numpy as np
from hdfs.ext.kerberos import KerberosClient
from mpl_toolkits.basemap import Basemap


def addMap(outDir, image, satLongitude, xmin, xmax, ymin, ymax, dt):
    plt.switch_backend('agg')
    plt.figure(figsize=(25, 15), dpi=100)
    # Geostationary projection cropped to the requested corner coordinates
    m = Basemap(projection='geos', lon_0=satLongitude, resolution='i', area_thresh=1000,
                llcrnrx=xmin, llcrnry=ymin, urcrnrx=xmax, urcrnry=ymax)
    m.imshow(np.flipud(image[1]))
    m.drawcoastlines()
    m.drawcountries()
    m.drawstates()
    # plt.title('GOES-16 Pseudo Color\n%s' % dt.strftime('%B %d, %Y UTC'))
    # Render to an in-memory PNG and stream it to HDFS
    buf = BytesIO()
    plt.savefig(buf, format='png', bbox_inches='tight', pad_inches=0)
    buf.seek(0)
    client = KerberosClient('http://hc.gps.stthomas.edu:50070')
    with client.write(outDir + '/MAP_' + image[0].split("/")[-1], overwrite=True) as writer:
        writer.write(buf.getvalue())
    buf.close()
print('Fitting model to data...')
clf = OneVsRestLogisticRegression(
    negative_column_index=NEGATIVE_COLUMN_INDEX, class_weight='balanced')
clf.fit(X, y)

print('Saving to HDFS...')
# Serialize the label binarizer, the vectorizer and the fitted classifier
mlb_pickle = pickle.dumps(mlb)
vectorizer_pickle = pickle.dumps(vectorizer)
clf_pickle = pickle.dumps(clf)

# Keep only the trailing path components of HDFS_MODEL_DIR and version the artifacts by timestamp
formatted_hdfs_path = "/".join(HDFS_MODEL_DIR.split('/')[5:])
current_time = datetime.now().strftime('%Y%m%d%H%M%S')
client.write('{}/{}/model/mlb_binarizer.pkl'.format(formatted_hdfs_path, current_time),
             mlb_pickle, overwrite=True)
client.write('{}/{}/model/vectorizer.pkl'.format(formatted_hdfs_path, current_time),
             vectorizer_pickle, overwrite=True)
client.write('{}/{}/model/model.pkl'.format(formatted_hdfs_path, current_time),
             clf_pickle, overwrite=True)

# Store the keys used for training as a one-column CSV
keys_string = 'SNCA_DK\n' + "\n".join([str(int(k)) for k in train_keys])
client.write('{}/{}/model/train_keys.csv'.format(formatted_hdfs_path, current_time),
             keys_string, overwrite=True)
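A hedged sketch of how the pickled artifacts written above could be loaded back through the same client; the timestamped directory is whichever run you want to restore, and the variable names below are illustrative only.

# Sketch of reloading the artifacts saved above (model directory chosen for illustration).
import pickle

model_dir = '{}/{}/model'.format(formatted_hdfs_path, current_time)
with client.read('{}/model.pkl'.format(model_dir)) as reader:
    clf_restored = pickle.loads(reader.read())
with client.read('{}/mlb_binarizer.pkl'.format(model_dir)) as reader:
    mlb_restored = pickle.loads(reader.read())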
# y has to be given in str format here, so as to store the results as tuples
# even if they have a single element
df_results = pd.DataFrame(
    np.concatenate(
        (df[ID_COLUMN].values.reshape(-1, 1),
         np.array([str(p) for p in y]).reshape(-1, 1)),
        axis=1),
    columns=[ID_COLUMN, LABEL_COLUMN]
)

print('Writing results to HDFS...')
formatted_hdfs_path = "/".join(HDFS_MODEL_DIR.split('/')[5:])
current_time = datetime.now().strftime('%Y%m%d%H%M%S')
client.write(
    '{}/{}/results/{}.csv'.format(formatted_hdfs_path, most_recent_date, current_time),
    df_results.to_csv(index=False),
    overwrite=True)

# Should only commit everything at the end, in a single transaction
conn.jconn.setAutoCommit(False)
set_module_and_client(curs, 'DUNANT IA')

# Some applications of the model should not update the database tables
if UPDATE_TABLES:
    print('Writing results to tables...')
    for labels, snca_dk in zip(y, df[ID_COLUMN].values):
        update_motivo_declarado(curs, snca_dk, labels)
        update_atividade_sindicancia(curs, snca_dk, ROBOT_NAME, ROBOT_NUMBER)
py_logger.info("hive execution completed")

# Stream each file from the S3 bucket directly into HDFS
client = KerberosClient(hdfs_url)
s3 = session.client('s3', use_ssl=False, verify=False)
counter = 0
for file_path in file_list_arr:
    file_path = source_directory + file_path
    file_name = os.path.basename(file_path)
    key_name = s3_folder_name + file_name
    # client.write() yields a writable file object, so the S3 object is copied
    # straight into the HDFS file without touching the local disk
    with client.write(file_path) as f:
        s3.download_fileobj(bucket_name, key_name, f)
    counter = counter + 1
    py_logger.info("File: " + file_path + " downloaded from s3 bucket")
py_logger.info("S3 script execution completed. No. of files downloaded: " + str(counter))

# Compress log files that are more than 30 days old
today = date.today()
current_day = datetime.now().strftime('%d')
log_directory = log_file_path.rpartition('/')[0] + log_file_path.rpartition('/')[1]
tarFileName = log_directory + today.strftime("%d-%m-%Y") + '.tar.gz'
if current_day == "30":
    # writing files to a compressed file
    ...
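The excerpt breaks off inside the compression branch. A minimal sketch of what that branch could look like with the standard tarfile module, assuming the logs to archive sit in log_directory and end in .log; the file pattern, 30-day cutoff and removal of the originals are assumptions, not taken from the original code.

# Hedged sketch of the missing compression branch (pattern and cutoff are assumptions).
import glob
import os
import tarfile
import time

if current_day == "30":
    cutoff = time.time() - 30 * 24 * 3600  # roughly 30 days ago
    with tarfile.open(tarFileName, 'w:gz') as tar:
        for old_log in glob.glob(log_directory + '*.log'):
            if os.path.getmtime(old_log) < cutoff:
                tar.add(old_log, arcname=os.path.basename(old_log))
                os.remove(old_log)  # drop originals once archived (optional)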
from hdfs.ext.kerberos import KerberosClient


class OperateHDFS:
    def __init__(self, url):
        '''
        :param url: Hostname or IP address of the HDFS NameNode, prefixed with the protocol
                    and followed by the WebHDFS port on the namenode. Multiple URLs separated
                    by semicolons may also be given for High Availability support.
        '''
        # Instantiate the HDFS web client using Kerberos authentication
        self.client = KerberosClient(url)

    def file_list(self, file_path):
        '''
        :param file_path: remote HDFS directory path
        :return: all files contained in the remote directory
        '''
        file_detail = self.client.list(hdfs_path=file_path)
        return file_detail

    def file_read(self, file_path):
        '''
        Read a file from HDFS
        :param file_path: remote HDFS file path
        :return: list of stripped lines
        '''
        lines = []
        with self.client.read(hdfs_path=file_path, encoding='utf-8', delimiter='\n') as reader:
            # content = reader.read()
            # print(content)
            for item in reader:
                lines.append(item.strip())
        return lines

    def file_create_write(self, file_path, data_write):
        '''
        Create a new file in HDFS and write content to it
        :param file_path: remote HDFS file path
        :param data_write: data to write to the file
        :return:
        '''
        self.client.write(hdfs_path=file_path, data=data_write, encoding='utf-8')

    def file_append_write(self, file_path, data_append):
        '''
        Append content to a file in HDFS; the file must already exist
        :param file_path: remote HDFS file path
        :param data_append: data to append to the file
        :return:
        '''
        self.client.write(hdfs_path=file_path, data=data_append, encoding='utf-8', append=True)

    def file_rename(self, src_file_path, dst_file_path):
        '''
        Rename/move a file or folder
        :param src_file_path: source path
        :param dst_file_path: destination path
        :return:
        '''
        self.client.rename(hdfs_src_path=src_file_path, hdfs_dst_path=dst_file_path)

    def mkdir(self, file_path):
        '''
        Create a remote directory in HDFS, recursively if necessary
        :param file_path: path (including name) of the folder to create
        :return:
        '''
        self.client.makedirs(hdfs_path=file_path)

    def upload_files(self, file_path, local_path):
        '''
        Upload a file or directory to HDFS
        :param file_path: target HDFS path. If it already exists and is a directory,
                          the files will be uploaded into it.
        :param local_path: local path of the file or folder. If it is a folder, all files
                           in it will be uploaded (note this means a folder without files
                           will not be created remotely).
        :return: hdfs_path_return: on success, this method returns the remote upload path.
        '''
        hdfs_path_return = self.client.upload(hdfs_path=file_path, local_path=local_path)
        return hdfs_path_return

    def download_files(self, file_path, local_path):
        '''
        Download a file or folder from HDFS and save it locally
        :param file_path: path of the file or folder to download from HDFS. If it is a
                          folder, all files under it will be downloaded.
        :param local_path: local path. If it already exists and is a directory, the files
                           will be downloaded into it.
        :return: local_path_return: on success, this method returns the local download path
        '''
        local_path_return = self.client.download(hdfs_path=file_path, local_path=local_path)
        return local_path_return

    def delete_files(self, file_path):
        '''
        Delete a file or directory from HDFS
        :param file_path: path of the file or directory to delete in HDFS
        :return: returns `True` if the deletion succeeded, and `False` if no file or
                 directory previously existed at `hdfs_path`.
        '''
        # recursive: recursively delete files and directories. By default, this method
        #            raises an HdfsError when trying to delete a non-empty directory.
        # skip_trash: when set to False, the deleted path is moved to an appropriate trash
        #             folder rather than deleted; this requires Hadoop 2.9+ with trash
        #             enabled on the cluster.
        return self.client.delete(hdfs_path=file_path, recursive=False, skip_trash=True)

    def set_files_permission(self, file_path):
        '''
        Change the permissions of a file
        :param file_path: path of the file whose permissions need to be changed
        :return:
        '''
        # permission: new octal permission string for the file
        self.client.set_permission(hdfs_path=file_path, permission=None)
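A short usage sketch for the wrapper class above; the WebHDFS URL and paths are placeholders, and a valid Kerberos ticket (e.g. obtained via kinit) is assumed.

# Usage sketch (hypothetical URL and paths, not from the original code).
hdfs = OperateHDFS('http://namenode.example.com:50070')
hdfs.mkdir('/user/demo/reports')
hdfs.file_create_write('/user/demo/reports/hello.txt', 'hello from OperateHDFS\n')
hdfs.file_append_write('/user/demo/reports/hello.txt', 'appended line\n')
print(hdfs.file_list('/user/demo/reports'))
print(hdfs.file_read('/user/demo/reports/hello.txt'))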