Example #1
def AzCopy( adl, source, target ):
  'Move a file from the source location to the target location.'
  multithread.ADLUploader(adl, lpath=source, rpath=target, nthreads=64, overwrite=True, buffersize=4194304, blocksize=4194304)
  log.debug("%sSource: %s", LEV2, source)
  log.debug("%sTarget: %s", LEV2, target)
  
  return
Example #2
 def upload_file(self, source, destination):
     multithread.ADLUploader(self.adl_conn_obj,
                             lpath=source,
                             rpath=destination,
                             nthreads=64,
                             overwrite=True,
                             buffersize=4194304,
                             blocksize=4194304)
Example #3
 def uploadfile_FromADS(self, inpath: str, outpath: str):
     adlsFileSystemClient = self._create_filesytem_conn()
     multithread.ADLUploader(adlsFileSystemClient,
                             lpath=inpath,
                             rpath=outpath,
                             nthreads=64,
                             overwrite=True,
                             buffersize=4194304,
                             blocksize=4194304)
Example #4
def uploadfiletoadls(store_name, adls_dir, l_path, r_path):
    adlsFileSystemClient = core.AzureDLFileSystem(credentials,
                                                  store_name=store_name)
    adlsFileSystemClient.mkdir(adls_dir)
    multithread.ADLUploader(adlsFileSystemClient,
                            lpath=l_path,
                            rpath=r_path,
                            nthreads=64,
                            overwrite=True,
                            buffersize=4194304,
                            blocksize=4194304)
    return 0
Example #5
    def transfer(self, local_filepath, remote_folderpath):
        """
        Transfer a file by the file name to Azure Datalake Storage instance.
        Supports single file upload
        param: local_filepath: Path of the file on the local machine, datatype is Path
               remote_filepath: Path of the file on the remote machine, datatype is Path
        """

        if not self.__is_valid_path(local_filepath):
            raise Exception("local_filepath is not valid")

        if not local_filepath.exists():
            raise Exception(local_filepath.name + " does not exist at path " +
                            str(local_filepath))

        if not self.__is_valid_path(remote_folderpath):
            raise Exception("remote_folderpath is not valid")

        if not self.has_remote_folder(remote_folderpath):
            self.make_folder(remote_folderpath)

        remote_filepath = remote_folderpath.joinpath(local_filepath.name)

        if self.client.exists(str(remote_filepath)):
            print("{filename} already exists in datalake!".format(
                filename=local_filepath.name))

        else:
            """
            If the size of the file to be transferred becomes too large, may need to modify 
            local_file_size to transfer the file in smaller chunks 
            """
            local_file_size = path.getsize(str(local_filepath))
            should_overwrite = True

            multithread.ADLUploader(self.client,
                                    lpath=str(local_filepath),
                                    rpath=str(remote_filepath),
                                    overwrite=should_overwrite,
                                    progress_callback=self.in_progress,
                                    buffersize=local_file_size,
                                    blocksize=local_file_size)

            data = {
                "folder": remote_folderpath,
                "file": local_filepath.name,
                "size": local_file_size
            }

            if self.__pass_final_check(local_filepath, remote_folderpath,
                                       local_file_size):
                print("Finished transferring {filename}".format(
                    filename=local_filepath.name))
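The chunking comment above suggests capping the buffer and block sizes rather than sizing them to the whole file. A minimal sketch of that variant of the uploader call inside transfer(), where the 4 MiB cap is an illustrative choice mirroring the defaults used in the other examples:

            # Sketch: cap buffer/block size at 4 MiB so very large files are
            # streamed in chunks rather than buffered whole.
            chunk_cap = 4 * 1024 * 1024
            multithread.ADLUploader(self.client,
                                    lpath=str(local_filepath),
                                    rpath=str(remote_filepath),
                                    overwrite=should_overwrite,
                                    progress_callback=self.in_progress,
                                    buffersize=min(local_file_size, chunk_cap),
                                    blocksize=min(local_file_size, chunk_cap))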
Example #6
def csv_file():
    print("Upload data to the Azure Data lake")
    todays = today.strftime('%Y-%m-%d')
    updated_remote_file = remote_path + todays + '.csv'
    output_name = todays + '.csv'
    print("Uploading the generated file to the Azure Data Lake....")
    multithread.ADLUploader(adlsFileSystemClient,
                            lpath=output_name,
                            rpath=updated_remote_file,
                            nthreads=64,
                            overwrite=True,
                            buffersize=4194304,
                            blocksize=4194304)
    print("All steps completed------------------------------------")
Example #7
def gateway_list():
    print("Uploading gateway list  to the Azure Data lake")
    todays = today.strftime('%Y-%m-%d')
    updated_remote_gateway_file = remote_gateway_path + todays + '.txt'
    gateway_list_name = 'gateway_needed_info.txt'
    print("Uploading the generated file to the Azure Data Lake....")
    multithread.ADLUploader(adlsFileSystemClient,
                            lpath=gateway_list_name,
                            rpath=updated_remote_gateway_file,
                            nthreads=64,
                            overwrite=True,
                            buffersize=4194304,
                            blocksize=4194304)
    os.remove('config.py')
    os.remove('gateway_needed_info.txt')
    os.remove('gateway_needed_info_old.txt')
Example #8
def upload_azure_datalake():
    try:
        from azure.datalake.store import core, lib, multithread
        sp_creds = json.loads(open(os.environ['AZURE_AUTH_LOCATION']).read())
        dl_filesystem_creds = lib.auth(tenant_id=sp_creds['tenantId'],
                                       client_secret=sp_creds['clientSecret'],
                                       client_id=sp_creds['clientId'],
                                       resource='https://datalake.azure.net/')
        datalake_client = core.AzureDLFileSystem(dl_filesystem_creds, store_name=args.azure_datalake_account)
        for f in dataset_file:
            multithread.ADLUploader(datalake_client,
                                    lpath='/tmp/{0}'.format(f),
                                    rpath='{0}/{1}_dataset/{2}'.format(args.storage, args.notebook, f))
    except Exception as err:
        print('Failed to upload test dataset to datalake store', str(err))
        sys.exit(1)
Example #9
    def upload_file(
        self,
        local_path: str,
        remote_path: str,
        nthreads: int = 64,
        overwrite: bool = True,
        buffersize: int = 4194304,
        blocksize: int = 4194304,
        **kwargs,
    ) -> Any:
        """
        Upload a file to Azure Data Lake.

        :param local_path: local path. Can be single file, directory (in which case,
            upload recursively) or glob pattern. Recursive glob patterns using `**`
            are not supported.
        :type local_path: str
        :param remote_path: Remote path to upload to; if multiple files, this is the
            directory root to write within.
        :type remote_path: str
        :param nthreads: Number of threads to use. If None, uses the number of cores.
        :type nthreads: int
        :param overwrite: Whether to forcibly overwrite existing files/directories.
            If False and remote path is a directory, will quit regardless if any files
            would be overwritten or not. If True, only matching filenames are actually
            overwritten.
        :type overwrite: bool
        :param buffersize: int [2**22]
            Number of bytes for internal buffer. This block cannot be bigger than
            a chunk and cannot be smaller than a block.
        :type buffersize: int
        :param blocksize: int [2**22]
            Number of bytes for a block. Within each chunk, we write a smaller
            block for each API call. This block cannot be bigger than a chunk.
        :type blocksize: int
        """
        multithread.ADLUploader(
            self.get_conn(),
            lpath=local_path,
            rpath=remote_path,
            nthreads=nthreads,
            overwrite=overwrite,
            buffersize=buffersize,
            blocksize=blocksize,
            **kwargs,
        )
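A short usage sketch for this hook-style upload_file method; the AzureDataLakeHook class name and how its connection is configured are assumptions for illustration, only the parameters shown in the signature above come from the example:

# Hypothetical hook object exposing the upload_file() method above.
hook = AzureDataLakeHook()

# Upload every CSV matching the glob into /raw/incoming, using 32 threads
# instead of the default 64 and keeping the overwrite semantics as-is.
hook.upload_file(local_path="data/*.csv",
                 remote_path="/raw/incoming",
                 nthreads=32)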
Example #10
def upload_one_shot():
    for dirName in os.listdir(LOCAL_PATH):
        partionnedDirName = dirName[:4] + '/' + dirName[4:]
        partionnedDirName = partionnedDirName[:7] + '/' + partionnedDirName[7:]
        print(partionnedDirName)
        adl.mkdir(INGESTION_DATALAKE_PATH + partionnedDirName)
        for filename in os.listdir(LOCAL_PATH + dirName):
            if fnmatch.fnmatch(filename, TARGETED_ZIP_FILE):
                zipFileRef = zipfile.ZipFile(LOCAL_PATH + dirName + "\\" +
                                             filename)
                CSVFilePath = zipFileRef.extract(TARGETED_FILE, UNZIPPED_PATH)
                multithread.ADLUploader(adl,
                                        lpath=CSVFilePath,
                                        rpath=INGESTION_DATALAKE_PATH +
                                        partionnedDirName,
                                        overwrite=True)
                os.remove(UNZIPPED_PATH + TARGETED_FILE)
                print('ingestion successful for: ' + partionnedDirName)
Example #11
def uploadToADL(file_location):

    # ADL Specific Variables
    #subscriptionId = '1acf0831-5469-4d44-b6b8-dada2a38f9c4'
    adlsAccountName = 'rctbraj'
    RESOURCE = 'https://datalake.azure.net/'

    # Active Directory Specific Variables
    tenant = 'de005ab9-410a-421b-b09a-df30074a2a0f'  # Directory Id
    client_id = '6dc493ca-ac45-4ef2-978f-6cdf34d73a58'
    client_secret = 'pXa9Hsmy5X54ritW3CT9Uiy22KZ4ogYxzWDy6eS/UBs='

    # Authenticate and get credentials
    try:
        adlCreds = lib.auth(tenant_id=tenant,
                            client_secret=client_secret,
                            client_id=client_id,
                            resource=RESOURCE)
    except Exception as e:
        print('Azure Data Lake Store Authentication Error')

    # Create a filesystem client object
    try:
        adlsFileSystemClient = core.AzureDLFileSystem(
            adlCreds, store_name=adlsAccountName)
    except Exception as e:
        print('Azure Data Lake Store Client Error')
    try:
        multithread.ADLUploader(adlsFileSystemClient,
                                lpath=file_location,
                                rpath='/{}'.format(
                                    os.path.basename(file_location)),
                                nthreads=64,
                                overwrite=True,
                                buffersize=4194304,
                                blocksize=4194304)
        print('File uploaded to Azure Data Lake Store ...')
    except Exception as e:
        print('Azure Data Lake Store Upload Error')
Example #12
    def dataLakeUpload(bucket_name):
        myfilepath = "DIRECTORY/TO/MY/FILES"
        allfiles = [files for files in sorted(listdir(myfilepath)) 
                    if isfile(join(myfilepath, files))]
        # upload all except the working one
        for i in range(len(allfiles)-2): 
            #print(myfilepath + allfiles[i])
            fullpath =  myfilepath + allfiles[i]

            datalakepath = '/dataDump/' + allfiles[i]
            print(datalakepath)
            
            try: 
                multithread.ADLUploader(adlsFileSystemClient, 
                        lpath=fullpath, 
                        rpath=datalakepath, nthreads=64, 
                        overwrite=True, buffersize=4194304, blocksize=4194304)
                print("Uploaded: "+ allfiles[i])
                os.unlink(fullpath)
                pass
            except BaseException as e:
                print(e)
Example #13
def upload_download(adl, diff_list):
    for element in sorted(
        [element for element in diff_list if "UPLOAD" in element["action"]],
            key=lambda x: str(x["type"] + x["name"])):
        print(element["name"])
        multithread.ADLUploader(adl,
                                rpath=element["name"],
                                lpath="./" + element["name"],
                                nthreads=64,
                                overwrite=True,
                                buffersize=4194304,
                                blocksize=4194304)
    for element in sorted(
        [element for element in diff_list if "DOWNLOAD" in element["action"]],
            key=lambda x: str(x["type"] + x["name"])):
        print(element["name"])
        multithread.ADLDownloader(adl,
                                  rpath=element["name"],
                                  lpath="./" + element["name"],
                                  nthreads=64,
                                  overwrite=True,
                                  buffersize=4194304,
                                  blocksize=4194304)
Example #14
print('Last simulation time:' + simulation_datetime_last_str)

simulation_datetime_last = datetime.datetime.strptime(
    simulation_datetime_last_str, '%m/%d/%Y %H:%M:%S')
simulation_datetime_cur = simulation_datetime_last + datetime.timedelta(days=1)
simulation_datetime_cur_str = datetime.datetime.strftime(
    simulation_datetime_cur, '%m/%d/%Y %H:%M:%S')

print('Current simulation time:' + simulation_datetime_cur_str)

f = open('LastSimulationDatetime.txt', 'w')
f.writelines(simulation_datetime_cur_str)
f.close()

multithread.ADLUploader(adl,
                        lpath='LastSimulationDatetime.txt',
                        rpath='/webjob_log/LastSimulationDatetime.txt',
                        overwrite=True)

webjob_simulator = 'Simulator'
webjob_optimization = 'InventoryOptimization'
webjob_order = 'GenerateOrder'
webjob_evaluation = 'Evaluation'


# This function constructs the URL used to run a web job or query its status
def construct_url(webjob_name, action):
    if action == 'run':
        url = "https://" + _WEB_APP_USER + ":" + _WEB_APP_PASSWORD + "@" + _WEB_APP_NAME + ".scm.azurewebsites.net/api/triggeredwebjobs/" + webjob_name + "/" + action + "?arguments=\"" + simulation_datetime_cur_str + "\""
    else:
        url = "https://" + _WEB_APP_USER + ":" + _WEB_APP_PASSWORD + "@" + _WEB_APP_NAME + ".scm.azurewebsites.net/api/triggeredwebjobs/" + webjob_name
    return url
Example #15
    print("Started solving optimization problem")
    optimization_start_time = time.time()
    output_file = OptimizeInventory(
        adls_file_system_client, args.input_adl_folder, args.partition_str,
        args.inventory_policy_name, args.optimization_definition,
        args.solver_name, args.solver_path, args.file_extension,
        args.directory_name, args.timestamp)
    print("Total time for generating solution: " + " %s seconds." %
          (time.time() - optimization_start_time))

    upload_result_start_time = time.time()
    output_remote_dir_name = args.output_adl_folder + '/' + args.directory_name
    if args.partition_str != 'none':
        partition = args.partition_str.split(',')
        for level in partition:
            output_remote_dir_name += '/' + level

    print('Uploading file {} to ADL folder [{}]...'.format(
        output_file, output_remote_dir_name))

    multithread.ADLUploader(adls_file_system_client,
                            lpath=output_file,
                            rpath=output_remote_dir_name + '/' +
                            os.path.basename(output_file),
                            overwrite=True)
    print("Uploading results took " + " %s seconds." %
          (time.time() - upload_result_start_time))

    print("Total time:" + " %s seconds." % (time.time() - start_time_all))
Example #16
# Performs authentication for accessing Azure Data Lake Store (ADLS)
token = lib.auth()

# Create an ADLS File System Client. The store_name is the name of your ADLS account
adlsFileSystemClient = core.AzureDLFileSystem(token,
                                              store_name='wesaprod0adlstore')

# Create a directory in ADLS
adlsFileSystemClient.mkdir('/testDirectoryPython')

# Upload file to created directory
multithread.ADLUploader(
    adlsFileSystemClient,
    lpath='C:\\Users\\aznaik\\Desktop\\PythonADL\\data.csv',
    rpath='/testDirectoryPython/data.csv',
    nthreads=64,
    overwrite=True,
    buffersize=4194304,
    blocksize=4194304)

# Download file from created directory
multithread.ADLDownloader(
    adlsFileSystemClient,
    lpath='C:\\Users\\aznaik\\Desktop\\PythonADL\\data.csv',
    rpath='/testDirectoryPython/data.csv',
    nthreads=64,
    overwrite=True,
    buffersize=4194304,
    blocksize=4194304)

# Delete directory (removes sub-directories/files recursively)
adlsFileSystemClient.rm('/testDirectoryPython', recursive=True)
Example #17
    csvfile.write(','.join(fields) + '\n')

# Write attributes and kml out to csv
for feat in lyr:
    attributes = feat.items()
    geom = feat.GetGeometryRef()
    attributes['kmlgeometry'] = geom.ExportToKML()
    csvwriter.writerow(attributes)

#clean up
del csvwriter, lyr, ds
csvfile.close()

## Upload the csv file to ADLS
multithread.ADLUploader(adlsFileSystemClient,
                        rpath='csvfiles/testfile.csv',
                        lpath=csvfilename,
                        overwrite=True)

## Do SQL stuff!
## Create SQL connection
server = 'tcp:ibisqlserver.database.windows.net,1433'
database = 'ibidb1'
username = '******'
password = '******'
driver = '{ODBC Driver 13 for SQL Server}'
conn = pyodbc.connect('DRIVER=' + driver + ';SERVER=' + server + ';DATABASE=' +
                      database + ';UID=' + username + ';PWD=' + password)
cursor = conn.cursor()

## Get the column names and data types from ADLS
multithread.ADLDownloader(adlsFileSystemClient,
Example #18
            print('Extracting IXI HH data from {}.'.format(fnames[key]))
            output_dir = os.path.join('/clusters/DLTK_IXI_Dataset', key)
            ## Create an output directory
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            print("outputdir: ", output_dir)
            with adlsFileSystemClient.open(fnames[key], 'rb') as f:
                t = tarfile.open(name=key, fileobj=f, mode='r', debug=2)
                for member in t.getmembers():
                    if '-HH-' in member.name:
                        t.extract(member, output_dir)
                        ## Extract and store into the new Data Lake Store folder
                        multithread.ADLUploader(adlsFileSystemClient,
                                                lpath=output_dir,
                                                rpath=output_dir,
                                                nthreads=64,
                                                overwrite=True,
                                                buffersize=4194304,
                                                blocksize=4194304)

# COMMAND ----------

if PROCESS_OTHER:
    # Process the demographic xls data and save to csv
    with adlsFileSystemClient.open('/clusters/DLTK_IXI_Dataset/IXI.xls',
                                   'rb') as f:
        xls = pd.ExcelFile(f)
    print(xls.sheet_names)

    df = xls.parse('Table')
    for index, row in df.iterrows():
Example #19
def bench_upload_1_50gb(adl, lpath, rpath, config):
    return multithread.ADLUploader(
        adl,
        lpath=lpath,
        rpath=rpath,
        **config[bench_upload_1_50gb.__name__])
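The benchmark above pulls its uploader keyword arguments from a config mapping keyed by the benchmark function's name; a sketch of what such a config entry might look like (the exact values are an assumption, any ADLUploader keyword argument works):

# Hypothetical benchmark config: one entry per benchmark function name,
# holding the ADLUploader keyword arguments for that run.
config = {
    'bench_upload_1_50gb': {
        'nthreads': 64,
        'overwrite': True,
        'buffersize': 4 * 2**20,
        'blocksize': 4 * 2**20,
    },
}

uploader = bench_upload_1_50gb(adl, '/local/50gb.bin', '/bench/50gb.bin', config)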
Example #20
20170827,
20170828,
20170831,
20171001,
20171004,
20171008,
20171010,
20171016,
20171017,
20171020,
20171024]


import pyarrow as pa
import pyarrow.parquet as pq

for d in dates:

    # Read a file into pandas dataframe
    with adlsFileSystemClient.open('/PROD/API-STAT/Bulk/csv/{0}/data{0}.csv'.format(d), 'rb') as f:
        df = pd.read_csv(f)

    ## for parquet ##
    table = pa.Table.from_pandas(df)
    pq.write_table(table, 'data.parquet')

    ## Upload a file
    multithread.ADLUploader(adlsFileSystemClient, lpath='/Users/mattlivingston/PycharmProjects/stat_data_processing/data.parquet',
                            rpath='/PROD/API-STAT/Bulk/parquet/{0}/data{0}.parquet'.format(d), nthreads=64, overwrite=True,
                            buffersize=4194304, blocksize=4194304)
Example #21
    return args


def client(args):
    """Create a filesystem client object
    Parameters:
        args (class): Arguments.
    """
    adls_client = core.AzureDLFileSystem(store_name=args.account_name)
    return adls_client


if __name__ == "__main__":
    args = parse()
    adls_client = client(args)
    print("Uploading content to ADLS account: {}".format(args.account_name))
    print("Uploading {0} into {1}...".format(args.local_folder,
                                             args.adls_folder))
    threads = multiprocessing.cpu_count()
    start_time = time.time()
    multithread.ADLUploader(adls_client,
                            lpath=args.local_folder,
                            rpath=args.adls_folder,
                            nthreads=threads,
                            overwrite=True,
                            buffersize=4194304,
                            blocksize=4194304,
                            verbose=True)
    print()
    print("Process time {} seconds".format(time.time() - start_time))
Example #22
 def upload(self, src, dest, overwrite=True):
     print('Uploading from {} to {}'.format(src, dest))
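     # Note: ADLUploader's positional order is (adlfs, rpath, lpath), so the
     # remote destination is passed before the local source here.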
     multithread.ADLUploader(self.adls, dest, src, overwrite=overwrite)
     print('Uploaded!')
Example #23
_CLIENT_ID = os.environ['CLIENT_ID']
_CLIENT_SECRET = os.environ['CLIENT_SECRET']

#Web App credentials
_WEB_APP_NAME = os.environ['FUNCTIONS_APP_NAME']
_WEB_APP_USER = os.environ['FUNCTIONS_APP_USER']
_WEB_APP_PASSWORD = os.environ['FUNCTIONS_APP_PASSWORD']

#localPath='D:\\home\\site\\wwwroot\\app_data\\jobs\\triggered\\UploadScriptToADLS\\scriptData\\'

#Running webjob to upload static/rawdata to Azure DataLake Store
webjob_name = 'UploadStaticDataToADLS'
url = "https://" + _WEB_APP_USER + ":" + _WEB_APP_PASSWORD + "@" + _WEB_APP_NAME + ".scm.azurewebsites.net/api/triggeredwebjobs/" + webjob_name + "/run"
response = requests.post(url)
print('Webjob ' + webjob_name + ' started.')

#Uploading Script Data to Azure DataLake Store
dir_list = next(os.walk('.' + '\\scriptData\\'))[1]
token = lib.auth(tenant_id=_TENANT_ID,
                 client_id=_CLIENT_ID,
                 client_secret=_CLIENT_SECRET)
adls_file_system_client = core.AzureDLFileSystem(token, store_name=_ADL_NAME)

for dir in dir_list:
    local_path = cwd + '\\scriptData\\' + dir
    print('Uploading ' + local_path)
    remote_path = dir + '\\'
    multithread.ADLUploader(adls_file_system_client,
                            lpath=local_path,
                            rpath=remote_path,
                            overwrite=True)
Example #24
from azure.mgmt.datalake.store import DataLakeStoreAccountManagementClient
from azure.mgmt.datalake.store.models import DataLakeStoreAccount

## Required for Azure Data Lake Store filesystem management
from azure.datalake.store import core, lib, multithread

# Common Azure imports
from azure.mgmt.resource.resources import ResourceManagementClient
from azure.mgmt.resource.resources.models import ResourceGroup

## Use these as needed for your application
import logging, getpass, pprint, uuid, time

## Declare variables
subscriptionId = ######
adlsAccountName = ########

## Create a filesystem client object
adlsFileSystemClient = core.AzureDLFileSystem( store_name=adlsAccountName)

adlsFileSystemClient.mkdir('/transformationV2')


## Upload a file
multithread.ADLUploader(adlsFileSystemClient,
                        lpath='C:\\data\\mysamplefile.txt',
                        rpath='/mysampledirectory/mysamplefile.txt',
                        nthreads=64,
                        overwrite=True,
                        buffersize=4194304,
                        blocksize=4194304)
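For completeness, the matching download of the sample file follows the same ADLDownloader pattern shown in Example #16; the local target path below is just an illustrative value:

## Download the file back (parameters mirror the upload above)
multithread.ADLDownloader(adlsFileSystemClient,
                          lpath='C:\\data\\mysamplefile.downloaded.txt',
                          rpath='/mysampledirectory/mysamplefile.txt',
                          nthreads=64,
                          overwrite=True,
                          buffersize=4194304,
                          blocksize=4194304)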