def AzCopy(adl, source, target):
    """Upload a file from the source location to the target location."""
    multithread.ADLUploader(adl, lpath=source, rpath=target, nthreads=64,
                            overwrite=True, buffersize=4194304, blocksize=4194304)
    log.debug("%sSource: %s", LEV2, source)
    log.debug("%sTarget: %s", LEV2, target)
    return
def upload_file(self, source, destination):
    multithread.ADLUploader(self.adl_conn_obj, lpath=source, rpath=destination,
                            nthreads=64, overwrite=True,
                            buffersize=4194304, blocksize=4194304)
def uploadfile_FromADS(self, inpath: str, outpath: str):
    adlsFileSystemClient = self._create_filesytem_conn()
    multithread.ADLUploader(adlsFileSystemClient, lpath=inpath, rpath=outpath,
                            nthreads=64, overwrite=True,
                            buffersize=4194304, blocksize=4194304)
def uploadfiletoadls(store_name, adls_dir, l_path, r_path):
    adlsFileSystemClient = core.AzureDLFileSystem(credentials, store_name=store_name)
    adlsFileSystemClient.mkdir(adls_dir)
    multithread.ADLUploader(adlsFileSystemClient, lpath=l_path, rpath=r_path,
                            nthreads=64, overwrite=True,
                            buffersize=4194304, blocksize=4194304)
    return 0
def transfer(self, local_filepath, remote_folderpath):
    """Transfer a file by name to an Azure Data Lake Storage instance.

    Supports single-file upload only.

    :param local_filepath: Path of the file on the local machine (a ``Path``).
    :param remote_folderpath: Path of the destination folder on the remote store (a ``Path``).
    """
    if not self.__is_valid_path(local_filepath):
        raise Exception("local_filepath is not valid")
    if not local_filepath.exists():
        raise Exception(local_filepath.name + " does not exist at path " + str(local_filepath))
    if not self.__is_valid_path(remote_folderpath):
        raise Exception("remote_folderpath is not valid")
    if not self.has_remote_folder(remote_folderpath):
        self.make_folder(remote_folderpath)

    remote_filepath = remote_folderpath.joinpath(local_filepath.name)
    if self.client.exists(str(remote_filepath)):
        print("{filename} already exists in datalake!".format(
            filename=local_filepath.name))
    else:
        # If the file to be transferred becomes too large, local_file_size may
        # need to be reduced so the file is transferred in smaller chunks.
        local_file_size = path.getsize(str(local_filepath))
        should_overwrite = True
        multithread.ADLUploader(self.client,
                                lpath=str(local_filepath),
                                rpath=str(remote_filepath),
                                overwrite=should_overwrite,
                                progress_callback=self.in_progress,
                                buffersize=local_file_size,
                                blocksize=local_file_size)
        data = {
            "folder": remote_folderpath,
            "file": local_filepath.name,
            "size": local_file_size
        }
        if self.__pass_final_check(local_filepath, remote_folderpath, local_file_size):
            print("Finished transferring {filename}".format(
                filename=local_filepath.name))
def csv_file():
    print("Upload data to the Azure Data Lake")
    todays = today.strftime('%Y-%m-%d')
    updated_remote_file = remote_path + todays + '.csv'
    output_name = todays + '.csv'
    print("Uploading the generated file to the Azure Data Lake....")
    multithread.ADLUploader(adlsFileSystemClient, lpath=output_name,
                            rpath=updated_remote_file, nthreads=64, overwrite=True,
                            buffersize=4194304, blocksize=4194304)
    print("All steps completed------------------------------------")
def gateway_list():
    print("Uploading gateway list to the Azure Data Lake")
    todays = today.strftime('%Y-%m-%d')
    updated_remote_gateway_file = remote_gateway_path + todays + '.txt'
    gateway_list_name = 'gateway_needed_info.txt'
    print("Uploading the generated file to the Azure Data Lake....")
    multithread.ADLUploader(adlsFileSystemClient, lpath=gateway_list_name,
                            rpath=updated_remote_gateway_file, nthreads=64,
                            overwrite=True, buffersize=4194304, blocksize=4194304)
    os.remove('config.py')
    os.remove('gateway_needed_info.txt')
    os.remove('gateway_needed_info_old.txt')
def upload_azure_datalake():
    try:
        from azure.datalake.store import core, lib, multithread
        sp_creds = json.loads(open(os.environ['AZURE_AUTH_LOCATION']).read())
        dl_filesystem_creds = lib.auth(
            tenant_id=json.dumps(sp_creds['tenantId']).replace('"', ''),
            client_secret=json.dumps(sp_creds['clientSecret']).replace('"', ''),
            client_id=json.dumps(sp_creds['clientId']).replace('"', ''),
            resource='https://datalake.azure.net/')
        datalake_client = core.AzureDLFileSystem(dl_filesystem_creds,
                                                 store_name=args.azure_datalake_account)
        for f in dataset_file:
            multithread.ADLUploader(datalake_client,
                                    lpath='/tmp/{0}'.format(f),
                                    rpath='{0}/{1}_dataset/{2}'.format(args.storage, args.notebook, f))
    except Exception as err:
        print('Failed to upload test dataset to datalake store', str(err))
        sys.exit(1)
def upload_file(
    self,
    local_path: str,
    remote_path: str,
    nthreads: int = 64,
    overwrite: bool = True,
    buffersize: int = 4194304,
    blocksize: int = 4194304,
    **kwargs,
) -> Any:
    """
    Upload a file to Azure Data Lake.

    :param local_path: local path. Can be single file, directory (in which case,
        upload recursively) or glob pattern. Recursive glob patterns using `**`
        are not supported.
    :type local_path: str
    :param remote_path: Remote path to upload to; if multiple files, this is the
        directory root to write within.
    :type remote_path: str
    :param nthreads: Number of threads to use. If None, uses the number of cores.
    :type nthreads: int
    :param overwrite: Whether to forcibly overwrite existing files/directories.
        If False and remote path is a directory, will quit regardless if any files
        would be overwritten or not. If True, only matching filenames are actually
        overwritten.
    :type overwrite: bool
    :param buffersize: int [2**22] Number of bytes for internal buffer. This block
        cannot be bigger than a chunk and cannot be smaller than a block.
    :type buffersize: int
    :param blocksize: int [2**22] Number of bytes for a block. Within each chunk,
        we write a smaller block for each API call. This block cannot be bigger
        than a chunk.
    :type blocksize: int
    """
    multithread.ADLUploader(
        self.get_conn(),
        lpath=local_path,
        rpath=remote_path,
        nthreads=nthreads,
        overwrite=overwrite,
        buffersize=buffersize,
        blocksize=blocksize,
        **kwargs,
    )
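# A minimal usage sketch for the hook method above. It assumes the method lives on
# an Airflow-style AzureDataLakeHook whose get_conn() returns an authenticated
# AzureDLFileSystem; the connection id and paths below are illustrative
# assumptions, not taken from the snippet itself.
from airflow.providers.microsoft.azure.hooks.data_lake import AzureDataLakeHook  # assumed import path

hook = AzureDataLakeHook(azure_data_lake_conn_id="azure_data_lake_default")
hook.upload_file(local_path="/tmp/report.csv", remote_path="/reports/report.csv")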
def upload_one_shot():
    for dirName in os.listdir(LOCAL_PATH):
        partionnedDirName = dirName[:4] + '/' + dirName[4:]
        partionnedDirName = partionnedDirName[:7] + '/' + partionnedDirName[7:]
        print(partionnedDirName)
        adl.mkdir(INGESTION_DATALAKE_PATH + partionnedDirName)
        for filename in os.listdir(LOCAL_PATH + dirName):
            if fnmatch.fnmatch(filename, TARGETED_ZIP_FILE):
                zipFileRef = zipfile.ZipFile(LOCAL_PATH + dirName + "\\" + filename)
                CSVFilePath = zipFileRef.extract(TARGETED_FILE, UNZIPPED_PATH)
                multithread.ADLUploader(adl, lpath=CSVFilePath,
                                        rpath=INGESTION_DATALAKE_PATH + partionnedDirName,
                                        overwrite=True)
                os.remove(UNZIPPED_PATH + TARGETED_FILE)
        print('Successful ingestion of: ' + partionnedDirName)
def uploadToADL(file_location):
    # ADL-specific variables
    # subscriptionId = '1acf0831-5469-4d44-b6b8-dada2a38f9c4'
    adlsAccountName = 'rctbraj'
    RESOURCE = 'https://datalake.azure.net/'

    # Active Directory-specific variables
    tenant = 'de005ab9-410a-421b-b09a-df30074a2a0f'  # Directory Id
    client_id = '6dc493ca-ac45-4ef2-978f-6cdf34d73a58'
    client_secret = 'pXa9Hsmy5X54ritW3CT9Uiy22KZ4ogYxzWDy6eS/UBs='

    # Authenticate and get credentials
    try:
        adlCreds = lib.auth(tenant_id=tenant, client_secret=client_secret,
                            client_id=client_id, resource=RESOURCE)
    except Exception as e:
        print('Azure Data Lake Store authentication error')

    # Create a filesystem client object
    try:
        adlsFileSystemClient = core.AzureDLFileSystem(adlCreds,
                                                      store_name=adlsAccountName)
    except Exception as e:
        print('Azure Data Lake Store client error')

    try:
        multithread.ADLUploader(adlsFileSystemClient, lpath=file_location,
                                rpath='/{}'.format(os.path.basename(file_location)),
                                nthreads=64, overwrite=True,
                                buffersize=4194304, blocksize=4194304)
        print('File uploaded to Azure Data Lake Store ...')
    except Exception as e:
        print('Azure Data Lake Store upload error')
def dataLakeUpload(bucket_name):
    myfilepath = "DIRECTORY/TO/MY/FILES"
    allfiles = [files for files in sorted(listdir(myfilepath))
                if isfile(join(myfilepath, files))]
    # Upload all except the working one
    for i in range(len(allfiles) - 2):
        # print(myfilepath + allfiles[i])
        fullpath = myfilepath + allfiles[i]
        datalakepath = '/dataDump/' + allfiles[i]
        print(datalakepath)
        try:
            multithread.ADLUploader(adlsFileSystemClient, lpath=fullpath,
                                    rpath=datalakepath, nthreads=64, overwrite=True,
                                    buffersize=4194304, blocksize=4194304)
            print("Uploaded: " + allfiles[i])
            os.unlink(fullpath)
        except BaseException as e:
            print(e)
def upload_download(adl, diff_list):
    for element in sorted(
            [element for element in diff_list if "UPLOAD" in element["action"]],
            key=lambda x: str(x["type"] + x["name"])):
        print(element["name"])
        multithread.ADLUploader(adl, rpath=element["name"],
                                lpath="./" + element["name"], nthreads=64,
                                overwrite=True, buffersize=4194304, blocksize=4194304)
    for element in sorted(
            [element for element in diff_list if "DOWNLOAD" in element["action"]],
            key=lambda x: str(x["type"] + x["name"])):
        print(element["name"])
        multithread.ADLDownloader(adl, rpath=element["name"],
                                  lpath="./" + element["name"], nthreads=64,
                                  overwrite=True, buffersize=4194304, blocksize=4194304)
print('Last simulation time:' + simulation_datetime_last_str)
simulation_datetime_last = datetime.datetime.strptime(
    simulation_datetime_last_str, '%m/%d/%Y %H:%M:%S')
simulation_datetime_cur = simulation_datetime_last + datetime.timedelta(days=1)
simulation_datetime_cur_str = datetime.datetime.strftime(
    simulation_datetime_cur, '%m/%d/%Y %H:%M:%S')
print('Current simulation time:' + simulation_datetime_cur_str)

f = open('LastSimulationDatetime.txt', 'w')
f.writelines(simulation_datetime_cur_str)
f.close()

multithread.ADLUploader(adl, lpath='LastSimulationDatetime.txt',
                        rpath='/webjob_log/LastSimulationDatetime.txt',
                        overwrite=True)

webjob_simulator = 'Simulator'
webjob_optimization = 'InventoryOptimization'
webjob_order = 'GenerateOrder'
webjob_evaluation = 'Evaluation'


# This function constructs the URL for running a webjob or getting its status
def construct_url(webjob_name, action):
    if action == 'run':
        url = ("https://" + _WEB_APP_USER + ":" + _WEB_APP_PASSWORD + "@" +
               _WEB_APP_NAME + ".scm.azurewebsites.net/api/triggeredwebjobs/" +
               webjob_name + "/" + action + "?arguments=\"" +
               simulation_datetime_cur_str + "\"")
    else:
        url = ("https://" + _WEB_APP_USER + ":" + _WEB_APP_PASSWORD + "@" +
               _WEB_APP_NAME + ".scm.azurewebsites.net/api/triggeredwebjobs/" +
               webjob_name)
print("Started solving optimization problem") optimization_start_time = time.time() output_file = OptimizeInventory( adls_file_system_client, args.input_adl_folder, args.partition_str, args.inventory_policy_name, args.optimization_definition, args.solver_name, args.solver_path, args.file_extension, args.directory_name, args.timestamp) print("Total time for generating solution: " + " %s seconds." % (time.time() - optimization_start_time)) upload_result_start_time = time.time() output_remote_dir_name = args.output_adl_folder + '/' + args.directory_name if args.partition_str != 'none': partition = args.partition_str.split(',') for level in partition: output_remote_dir_name += '/' + level print('Uploading file {} to ADL folder [{}]...'.format( output_file, output_remote_dir_name)) multithread.ADLUploader(adls_file_system_client, lpath=output_file, rpath=output_remote_dir_name + '/' + os.path.basename(output_file), overwrite=True) print("Uploading results took " + " %s seconds." % (time.time() - upload_result_start_time)) print("Total time:" + " %s seconds." % (time.time() - start_time_all))
# Performs authentication for accessing Azure Data Lake Store (ADLS)
token = lib.auth()

# Create an ADLS File System Client. The store_name is the name of your ADLS account
adlsFileSystemClient = core.AzureDLFileSystem(token, store_name='wesaprod0adlstore')

# Create a directory in ADLS
adlsFileSystemClient.mkdir('/testDirectoryPython')

# Upload file to created directory
multithread.ADLUploader(
    adlsFileSystemClient,
    lpath='C:\\Users\\aznaik\\Desktop\\PythonADL\\data.csv',
    rpath='/testDirectoryPython/data.csv',
    nthreads=64, overwrite=True, buffersize=4194304, blocksize=4194304)

# Download file from created directory
multithread.ADLDownloader(
    adlsFileSystemClient,
    lpath='C:\\Users\\aznaik\\Desktop\\PythonADL\\data.csv',
    rpath='/testDirectoryPython/data.csv',
    nthreads=64, overwrite=True, buffersize=4194304, blocksize=4194304)

# Delete directory (removes sub-directories/files recursively)
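# A sketch of the delete step the comment above refers to, assuming the standard
# azure.datalake.store API where AzureDLFileSystem.rm(path, recursive=True)
# removes a directory and everything beneath it.
adlsFileSystemClient.rm('/testDirectoryPython', recursive=True)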
csvfile.write(','.join(fields) + '\n')

# Write attributes and kml out to csv
for feat in lyr:
    attributes = feat.items()
    geom = feat.GetGeometryRef()
    attributes['kmlgeometry'] = geom.ExportToKML()
    csvwriter.writerow(attributes)

# Clean up
del csvwriter, lyr, ds
csvfile.close()

## Upload the csv file to ADLS
multithread.ADLUploader(adlsFileSystemClient, 'csvfiles/testfile.csv', csvfilename,
                        overwrite=True)

## Do SQL stuff!
## Create SQL connection
server = 'tcp:ibisqlserver.database.windows.net,1433'
database = 'ibidb1'
username = '******'
password = '******'
driver = '{ODBC Driver 13 for SQL Server}'
conn = pyodbc.connect('DRIVER=' + driver + ';SERVER=' + server + ';DATABASE=' + database +
                      ';UID=' + username + ';PWD=' + password)
cursor = conn.cursor()

## Get the column names and data types from ADLS
multithread.ADLDownloader(adlsFileSystemClient,
print('Extracting IXI HH data from {}.'.format(fnames[key]))
output_dir = os.path.join('/clusters/DLTK_IXI_Dataset', key)

## Create an output directory
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
print("outputdir: ", output_dir)

with adlsFileSystemClient.open(fnames[key], 'rb') as f:
    t = tarfile.open(name=key, fileobj=f, mode='r', debug=2)
    for member in t.getmembers():
        if '-HH-' in member.name:
            t.extract(member, output_dir)

## Extract and store into a new folder in the Data Lake Store
multithread.ADLUploader(adlsFileSystemClient, lpath=output_dir, rpath=output_dir,
                        nthreads=64, overwrite=True,
                        buffersize=4194304, blocksize=4194304)

# COMMAND ----------

if PROCESS_OTHER:
    # Process the demographic xls data and save to csv
    with adlsFileSystemClient.open('/clusters/DLTK_IXI_Dataset/IXI.xls', 'rb') as f:
        xls = pd.ExcelFile(f)
        print(xls.sheet_names)
        df = xls.parse('Table')
        for index, row in df.iterrows():
def bench_upload_1_50gb(adl, lpath, rpath, config):
    return multithread.ADLUploader(
        adl, lpath=lpath, rpath=rpath,
        **config[bench_upload_1_50gb.__name__])
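# A minimal sketch of the config mapping assumed by the benchmark above: keys are
# benchmark function names, values are keyword arguments forwarded to ADLUploader.
# The values below are illustrative, mirroring the defaults used elsewhere in these
# snippets, not taken from the benchmark's actual configuration.
config = {
    "bench_upload_1_50gb": {
        "nthreads": 64,
        "overwrite": True,
        "buffersize": 4194304,  # 4 MiB internal buffer
        "blocksize": 4194304,   # 4 MiB per write call
    },
}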
         20170827, 20170828, 20170831, 20171001, 20171004, 20171008, 20171010,
         20171016, 20171017, 20171020, 20171024]

for d in range(0, len(dates)):
    # Read a file into a pandas dataframe
    with adlsFileSystemClient.open('/PROD/API-STAT/Bulk/csv/{}/data{}.csv'.format(dates[d], dates[d]), 'rb') as f:
        df = pd.read_csv(f)

    ## for parquet ##
    import pyarrow as pa
    table = pa.Table.from_pandas(df)
    import pyarrow.parquet as pq
    pq.write_table(table, 'data.parquet')

    ## Upload a file
    multithread.ADLUploader(adlsFileSystemClient,
                            lpath='/Users/mattlivingston/PycharmProjects/stat_data_processing/data.parquet',
                            rpath='/PROD/API-STAT/Bulk/parquet/{}/data{}.parquet'.format(dates[d], dates[d]),
                            nthreads=64, overwrite=True,
                            buffersize=4194304, blocksize=4194304)
    return args


def client(args):
    """Create a filesystem client object.

    Parameters:
        args (class): Arguments.
    """
    adls_client = core.AzureDLFileSystem(store_name=args.account_name)
    return adls_client


if __name__ == "__main__":
    args = parse()
    adls_client = client(args)

    print("Uploading content to ADLS account: {}".format(args.account_name))
    print("Uploading {0} into {1}...".format(args.local_folder, args.adls_folder))

    threads = multiprocessing.cpu_count()
    start_time = time.time()
    multithread.ADLUploader(adls_client, lpath=args.local_folder,
                            rpath=args.adls_folder, nthreads=threads,
                            overwrite=True, buffersize=4194304, blocksize=4194304,
                            verbose=True)
    print()
    print("Process time {} seconds".format(time.time() - start_time))
def upload(self, src, dest, overwrite=True):
    print('Uploading from {} to {}'.format(src, dest))
    # ADLUploader's positional order is (adlfs, rpath, lpath): dest is the
    # remote path and src the local path.
    multithread.ADLUploader(self.adls, dest, src, overwrite=overwrite)
    print('Uploaded!')
_CLIENT_ID = os.environ['CLIENT_ID']
_CLIENT_SECRET = os.environ['CLIENT_SECRET']

# Web App credentials
_WEB_APP_NAME = os.environ['FUNCTIONS_APP_NAME']
_WEB_APP_USER = os.environ['FUNCTIONS_APP_USER']
_WEB_APP_PASSWORD = os.environ['FUNCTIONS_APP_PASSWORD']

# localPath='D:\\home\\site\\wwwroot\\app_data\\jobs\\triggered\\UploadScriptToADLS\\scriptData\\'

# Running webjob to upload static/raw data to Azure Data Lake Store
webjob_name = 'UploadStaticDataToADLS'
url = ("https://" + _WEB_APP_USER + ":" + _WEB_APP_PASSWORD + "@" + _WEB_APP_NAME +
       ".scm.azurewebsites.net/api/triggeredwebjobs/" + webjob_name + "/run")
response = requests.post(url)
print('Webjob ' + webjob_name + ' started.')

# Uploading script data to Azure Data Lake Store
dir_list = next(os.walk('.' + '\\scriptData\\'))[1]
token = lib.auth(tenant_id=_TENANT_ID, client_id=_CLIENT_ID, client_secret=_CLIENT_SECRET)
adls_file_system_client = core.AzureDLFileSystem(token, store_name=_ADL_NAME)
for dir in dir_list:
    local_path = cwd + '\\scriptData\\' + dir
    print('Uploading ' + local_path)
    remote_path = dir + '\\'
    multithread.ADLUploader(adls_file_system_client, lpath=local_path,
                            rpath=remote_path, overwrite=True)
from azure.mgmt.datalake.store import DataLakeStoreAccountManagementClient
from azure.mgmt.datalake.store.models import DataLakeStoreAccount

## Required for Azure Data Lake Store filesystem management
from azure.datalake.store import core, lib, multithread

# Common Azure imports
from azure.mgmt.resource.resources import ResourceManagementClient
from azure.mgmt.resource.resources.models import ResourceGroup

## Use these as needed for your application
import logging, getpass, pprint, uuid, time

## Declare variables
subscriptionId = ######
adlsAccountName = ########

## Create a filesystem client object
adlsFileSystemClient = core.AzureDLFileSystem(store_name=adlsAccountName)

adlsFileSystemClient.mkdir('/transformationV2')

## Upload a file
multithread.ADLUploader(adlsFileSystemClient,
                        lpath='C:\\data\\mysamplefile.txt',
                        rpath='/mysampledirectory/mysamplefile.txt',
                        nthreads=64, overwrite=True,
                        buffersize=4194304, blocksize=4194304)