def transformation(store_name: str, sql_query: "sql_query", sink_table_name: str):
    """
    Execute a sql query on a database and store the results in another table in the same database
    """
    request = {
        "SqlTransformationParameters": {
            "DataStoreName": store_name,
            "SqlQuery": sql_query
        },
        "SinkTableName": sink_table_name
    }
    response = neuro_call("80", "DataMovementService", "SqlTransformation", request)

    check_request = {"JobId": response["JobId"]}
    status = 0
    errormsg = ""
    while status == 0:
        time.sleep(1)
        response_c = neuro_call("80", "DataMovementService", "CheckJob", check_request)
        status = response_c["Status"]
        if status > 1:
            errormsg = response_c["Message"]
    neuro_call("80", "DataMovementService", "FinaliseJob", check_request)

    if status != 1:
        raise Exception("Neuroverse error: " + errormsg)

    return {"JobId": response["JobId"], "TimeStamp": response["TimeStamp"]}

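# Usage sketch for transformation() (illustrative; "MySqlStore", "SalesSummary" and
# my_sql_query are hypothetical, and my_sql_query is assumed to be a query structure
# built with this module's sql_query helpers):
#
#     result = transformation("MySqlStore", my_sql_query, "SalesSummary")
#     print(result["JobId"], result["TimeStamp"])
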
def delete_datalake_file(store_name: str, table_name: str, file_name_including_partition: str):
    """
    Delete a file from a processed datalake table in Neuroverse
    """
    table_def = sm.get_table_definition(store_name, table_name)
    schema_type = list(sm.SCHEMA_TYPE_MAP.keys())[list(sm.SCHEMA_TYPE_MAP.values()).index(table_def["SchemaType"])]
    file_path = "/managed/" + schema_type + "/table/" + table_name + "/"
    file_path = file_path.lower()
    file_path += file_name_including_partition.strip('/')

    request = {"DataStoreName": store_name, "TableName": table_name, "FilePath": file_path}
    response = neuro_call("80", "DataMovementService", "DataLakeDeleteFile", request)

    check_request = {"JobId": response["JobId"]}
    status = 0
    errormsg = ""
    while status == 0:
        time.sleep(1)
        response_c = neuro_call("80", "DataMovementService", "CheckJob", check_request)
        status = response_c["Status"]
        if status > 1:
            errormsg = response_c["Message"]
    neuro_call("80", "DataMovementService", "FinaliseJob", check_request)

    if status != 1:
        raise Exception("Neuroverse error: " + errormsg)

    return {"JobId": response["JobId"], "TimeStamp": response["TimeStamp"]}

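# Usage sketch for delete_datalake_file() (illustrative; the store, table and
# partitioned file path are hypothetical; the path is the part of the file path
# under the table folder, as returned by list_datalake_table_files_with_partitions):
#
#     delete_datalake_file("MyDataLake", "SensorReadings",
#                          "year=2020/month=01/part-0000.csv")
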
def stream(source: "SourceParameters", sink: "SinkParameters"):
    """
    Stream data from a tabular data source to a tabular data sink
    """
    request = {"SourceParameters": source, "SinkParameters": sink}
    method = source["Type"] + "To" + sink["Type"]
    response = neuro_call("80", "DataMovementService", method, request)

    check_request = {"JobId": response["JobId"]}
    status = 0
    errormsg = ""
    while status == 0:
        time.sleep(1)
        response_c = neuro_call("80", "DataMovementService", "CheckJob", check_request)
        status = response_c["Status"]
        if status > 1:
            errormsg = response_c["Message"]
    neuro_call("80", "DataMovementService", "FinaliseJob", check_request)

    if status != 1:
        raise Exception("Neuroverse error: " + errormsg)

    return {"JobId": response["JobId"], "TimeStamp": response["TimeStamp"]}

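# Usage sketch for stream() (illustrative; my_source and my_sink are hypothetical and
# assumed to be SourceParameters/SinkParameters dicts built elsewhere in this module;
# only their "Type" fields are read here, to pick the DataMovementService method):
#
#     stream(source=my_source, sink=my_sink)
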
def get_lines_in_datalake_csv(store_name: str, table_name: str, file_name_including_partition: str):
    """
    Get the number of lines for a file in a datalake
    """
    table_def = sm.get_table_definition(store_name, table_name)
    schema_type = list(sm.SCHEMA_TYPE_MAP.keys())[list(sm.SCHEMA_TYPE_MAP.values()).index(table_def["SchemaType"])]
    file_path = "/managed/" + schema_type + "/table/" + table_name + "/"
    file_path = file_path.lower()
    file_path += file_name_including_partition.strip('/')

    request = {"DataStoreName": store_name, "TableName": table_name, "FilePath": file_path}
    response = neuro_call("80", "DataMovementService", "GetLinesInDataLakeCsvFile", request)

    check_request = {"JobId": response["JobId"]}
    status = 0
    errormsg = ""
    while status == 0:
        time.sleep(1)
        response_c = neuro_call("80", "DataMovementService", "CheckJob", check_request)
        status = response_c["Status"]
        # On success the job message carries the line count, so capture it unconditionally
        errormsg = response_c["Message"]
    neuro_call("80", "DataMovementService", "FinaliseJob", check_request)

    if status != 1:
        raise Exception("Neuroverse error: " + errormsg)

    return int(errormsg)

def delete_rows(store_name: str, table_name: str, where_clause: str = None):
    """
    Delete rows of a sql table using a where clause.
    If no where clause is supplied all rows are deleted
    """
    request = {
        "DataStoreName": store_name,
        "TableName": table_name,
        "WhereClause": where_clause
    }
    response = neuro_call("80", "DataMovementService", "SqlDelete", request)

    check_request = {"JobId": response["JobId"]}
    status = 0
    errormsg = ""
    while status == 0:
        time.sleep(1)
        response_c = neuro_call("80", "DataMovementService", "CheckJob", check_request)
        status = response_c["Status"]
        if status > 1:
            errormsg = response_c["Message"]
    neuro_call("80", "DataMovementService", "FinaliseJob", check_request)

    if status != 1:
        raise Exception("Neuroverse error: " + errormsg)

    return None

def create_data_store(data_store_name: str, data_store_type: "DataStoreType",
                      data_store_tier: "StoreTierType" = StoreTierType.Small):
    """
    Create a data store in Neuroverse
    """
    request = {
        "StoreName": data_store_name,
        "DataStoreTypeId": data_store_type.value,
        "StoreTierTypeId": data_store_tier.value
    }
    neuro_call('80', 'datastoremanager', 'createdatastore', request)

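# Usage sketch for create_data_store() (illustrative; "AnalyticsStore" is hypothetical
# and my_store_type stands in for a DataStoreType enum member defined elsewhere in
# this module):
#
#     create_data_store("AnalyticsStore", my_store_type, StoreTierType.Small)
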
def add_table_indexes(store_name: str, table_name: str, table_indexes: "List[index_definition]"):
    """
    Add indexes to a table in a Neuroverse SQL data store
    """
    table_def = get_table_definition(store_name, table_name)
    # table_indexes is a list of index definitions, so extend rather than append the list itself
    table_def["DestinationTableDefinitionIndexes"].extend(table_indexes)
    neuro_call("80", "datapopulation", "UpdateDestinationTableDefinition", table_def)

def delete_data_store(data_store_name: str):
    """
    Delete a data store in Neuroverse (asks for interactive confirmation)
    """
    datastores = [ds for ds in list_data_stores()['DataStores']
                  if ds['StoreName'] == data_store_name]
    if len(datastores) == 0:
        raise Exception("Data store doesn't exist")
    # Require interactive confirmation before deleting
    check = input("Are you sure you want to delete %s (y/n)" % data_store_name)
    if check == 'y':
        request = {'DataStoreId': datastores[0]['DataStoreId']}
        neuro_call('80', 'datastoremanager', 'deletedatastore', request)
        return "%s has been deleted" % data_store_name

def sql_to_csv(store_name: str, sql_query: "sql_query", file_name: str):
    """
    Execute a sql query and have the result put in a csv file in your notebook session
    """
    # Resolve the file name relative to the notebook home directory, collapsing ".." segments
    file_name = (os.getcwd().replace(home_directory(), "") + "/" + file_name).strip('/')
    path_list = file_name.split('/')
    indices = [i for i, x in enumerate(path_list) if x == ".."]
    new_indices = []
    for ind in indices:
        new_indices.append(ind - 1)
        new_indices.append(ind)
    new_path_list = []
    for i in range(0, len(path_list)):
        if i not in new_indices:
            new_path_list.append(path_list[i])
    file_name = "/".join(new_path_list)

    request = {
        "SqlParameters": {
            "DataStoreName": store_name,
            "SqlQuery": sql_query
        },
        "FileName": file_name
    }
    response = neuro_call("80", "DataMovementService", "SqlQueryToCsvNotebookFileShare", request)

    check_request = {"JobId": response["JobId"]}
    status = 0
    errormsg = ""
    while status == 0:
        time.sleep(1)
        response_c = neuro_call("80", "DataMovementService", "CheckJob", check_request)
        status = response_c["Status"]
        if status > 1:
            errormsg = response_c["Message"]
    neuro_call("80", "DataMovementService", "FinaliseJob", check_request)

    if status != 1:
        raise Exception("Neuroverse error: " + errormsg)

    return None

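# Usage sketch for sql_to_csv() (illustrative; the store name, output path and
# my_sql_query are hypothetical; the path is resolved relative to the current
# notebook directory):
#
#     sql_to_csv("MySqlStore", my_sql_query, "exports/result.csv")
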
def list_views(store_name: str):
    """
    List SQL views
    """
    data_stores = neuro_call("80", "datastoremanager", "GetDataStores", {"StoreName": store_name})["DataStores"]
    if len(data_stores) == 0:
        raise Exception("Data store doesn't exist")
    response = neuro_call("80", "datapopulation", "ListDataPopulationViews",
                          {"DataStoreId": data_stores[0]["DataStoreId"]},
                          controller="DataPopulationView")
    return response["Names"]

def list_active_sessions():
    """
    Get list of active notebook sessions
    """
    return neuro_call("8080", "notebookmanagementservice", "GetDetailedSessionList", None)

def delete_view(store_name: str, view_name: str):
    """
    Delete a SQL view
    """
    data_stores = neuro_call("80", "datastoremanager", "GetDataStores", {"StoreName": store_name})["DataStores"]
    if len(data_stores) == 0:
        raise Exception("Data store doesn't exist")
    neuro_call("80", "datapopulation", "DeleteDataPopulationView",
               {"DataStoreId": data_stores[0]["DataStoreId"], "Name": view_name},
               controller="DataPopulationView")

def list_libraries(workspace_id: str = None, cluster_id: str = None, show_all: bool = False):
    """
    List the non default libraries available on the cluster
    """
    list_jobs_response = neuro_call("80", "sparkmanager", "ListClusterLibraries",
                                    {
                                        "WorkspaceId": workspace_id,
                                        "ClusterId": cluster_id
                                    })
    if show_all:
        return list_jobs_response["Libraries"]

    tmp_libraries = sorted(list_jobs_response["Libraries"],
                           key=lambda x: str(x['LibraryType']) + x['LibraryName'] + x['LibraryVersion'])
    libraries = []
    for i in tmp_libraries:
        if i['Status'] == 'INSTALLED' or i['Status'] == 'PENDING':
            libraries.append(i)
        elif (i['Status'] == 'UNINSTALL_ON_RESTART' and len(libraries) > 0
              and libraries[-1]['LibraryType'] == i['LibraryType']
              and libraries[-1]['LibraryName'] == i['LibraryName']):
            if libraries[-1]['Status'] == 'INSTALLED':
                libraries[-1]['Status'] = 'PENDING'
                libraries.append(i)
            else:
                libraries[-1] = i
    return libraries

def create_update_event_hub_raw_data_capture(
        namespace_name: str,
        event_hub_name: str,
        datalake_name: str,
        datetime_partition_level: "DateTimeLevels" = DateTimeLevels.NA,
        partition_id_level: "PartitionIdLevels" = PartitionIdLevels.NA,
        max_file_in_minutes: int = None,
        max_file_in_MB: int = None):
    endpoint = next(obj for obj in list_event_hubs(namespace_name)
                    if obj["EventHubNamespace"] == namespace_name
                    and obj["Name"] == event_hub_name
                    and obj['EndpointTypeId'] == 2)
    datastore = neuro_call('80', 'datastoremanager', 'getdatastores',
                           {"StoreName": datalake_name})['DataStores'][0]
    request = {
        'EndPointId': endpoint['EndPointId'],
        'DataStoreId': datastore['DataStoreId'],
        'PartitionByDateTimeLevel': datetime_partition_level.value,
        'PartitionByIdLevel': partition_id_level.value,
        'FileTimeMinutesMax': max_file_in_minutes,
        'FileSizeMBMax': max_file_in_MB
    }
    neuro_call_v2(service='endpointmanager', method='PutRawData',
                  requestbody=request, controller="endpointmanagement")

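# Usage sketch for create_update_event_hub_raw_data_capture() (illustrative; the
# namespace, event hub and data lake names are hypothetical):
#
#     create_update_event_hub_raw_data_capture(
#         "my-namespace", "telemetry-hub", "MyDataLake",
#         datetime_partition_level=DateTimeLevels.NA,
#         partition_id_level=PartitionIdLevels.NA,
#         max_file_in_minutes=15,
#         max_file_in_MB=100)
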
def sql_to_df(store_name: str, sql_query: "sql_query", use_pyodbc=True):
    """
    Execute a sql query and have the result put into a pandas dataframe in the notebook
    """
    if use_pyodbc:
        # Query directly over a pyodbc connection built from the store's connection string
        connstrbits = neuro_call('80', 'datastoremanager', 'GetDataStores',
                                 {'StoreName': store_name})['DataStores'][0]['ConnectionString'].split(';')
        server = connstrbits[0].split(':')[1].split(',')[0]
        database = connstrbits[1].split('=')[1]
        username = connstrbits[2].split('=')[1]
        password = connstrbits[3].split('=')[1]
        driver = '{ODBC Driver 13 for SQL Server}'
        with pyodbc.connect('DRIVER=' + driver + ';SERVER=' + server + ';PORT=1433;DATABASE=' + database +
                            ';UID=' + username + ';PWD=' + password) as cnxn:
            with cnxn.cursor() as cursor:
                return pandas.read_sql(build_sql(sql_query), cnxn)
    else:
        # Fall back to exporting the query to a temporary csv in the notebook file share and reading it back
        if not os.path.exists(home_directory() + "/tmp"):
            os.makedirs(home_directory() + "/tmp")
        file_name = str(uuid.uuid4()) + ".csv"
        count = len(os.getcwd().replace(home_directory(), "").split('/')) - 1
        backs = ""
        for c in range(0, count):
            backs += "../"
        sql_to_csv(store_name, sql_query, backs + "tmp/" + file_name)
        df = pandas.read_csv(home_directory() + "/" + "tmp/" + file_name)
        os.remove(home_directory() + "/" + "tmp/" + file_name)
        return df

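# Usage sketch for sql_to_df() (illustrative; the store name and my_sql_query are
# hypothetical):
#
#     df = sql_to_df("MySqlStore", my_sql_query)                    # via pyodbc
#     df = sql_to_df("MySqlStore", my_sql_query, use_pyodbc=False)  # via a temporary csv
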
def submit_job(job_name: str, pyspark_script: str,
               script_parameters: "List[script_parameter]" = None,
               import_tables: "List[import_table]" = None,
               export_tables: "List[export_table]" = None,
               dependencies: "List[library]" = None,
               workspace_id: str = None, cluster_id: str = None,
               run_retry: bool = False, max_concurrent_runs: int = None):
    """
    Submit a spark job (template) and receive back the JobId
    """
    return neuro_call("80", "sparkmanager", "submitjob",
                      {
                          "JobName": job_name,
                          "Script": pyspark_script,
                          "ScriptLanguage": 0,
                          "ScriptParameters": script_parameters,
                          "ImportTables": import_tables,
                          "ExportTables": export_tables,
                          "WorkspaceId": workspace_id,
                          "ClusterId": cluster_id,
                          "RunRetry": run_retry,
                          "MaxConcurrentRuns": max_concurrent_runs,
                          "LibraryDependencies": dependencies
                      })

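# Usage sketch for submit_job() (illustrative; the job name and script file are
# hypothetical, and any script parameter / import table / export table / library lists
# are assumed to be built with this module's helpers; the returned object carries the
# JobId described in the docstring):
#
#     with open("aggregate_sales.py") as f:
#         job = submit_job("nightly-aggregation", f.read())
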
def list_tables(store_name: str, table_name: str = '', schema_type: str = ''):
    """
    List existing tables in a Neuroverse data store
    """
    data_stores = neuro_call("80", "datastoremanager", "GetDataStores", {"StoreName": store_name})["DataStores"]
    if len(data_stores) == 0:
        raise Exception("Data store doesn't exist")
    table_defs = neuro_call("80", "DataPopulation", "GetTableInfos",
                            {"DataStoreId": data_stores[0]["DataStoreId"]})
    return [{'TableId': table['TableId'],
             'TableName': table['TableName'],
             'SchemaType': SCHEMA_TYPE_MAP_REV[table['TableTypeId']]}
            for table in table_defs['TableInfos']]

def get_view(store_name: str, view_name: str):
    """
    Get a SQL view
    """
    data_stores = neuro_call("80", "datastoremanager", "GetDataStores", {"StoreName": store_name})["DataStores"]
    if len(data_stores) == 0:
        raise Exception("Data store doesn't exist")
    response = neuro_call("80", "datapopulation", "GetDataPopulationView",
                          {"DataStoreId": data_stores[0]["DataStoreId"], "Name": view_name},
                          controller="DataPopulationView")
    return {"Name": response["Name"], "Query": response["Query"]}

def destroy_context(context_id: str):
    """
    Destroy an interactive spark context
    """
    destroy_context_response = neuro_call("80", "sparkmanager", "DestroyContext",
                                          {"ContextId": context_id})

def cancel_command(command_id: str):
    """
    Cancel a running command in a context
    """
    cancel_command_response = neuro_call("80", "sparkmanager", "CancelCommand",
                                         {"CommandId": command_id})

def cancel_run(run_id: str):
    """
    Cancel a running instance of a job
    """
    cancel_run_response = neuro_call("80", "sparkmanager", "CancelRun",
                                     {"RunId": run_id})

def list_datalake_table_files_with_partitions(store_name: str, table_name: str):
    """
    List all the files associated with a datalake table in Neuroverse
    """
    request = {"DataStoreName": store_name, "TableName": table_name}
    files = neuro_call("80", "DataMovementService", "ListDataLakeTableFiles", request)["Files"]
    return_list = []
    for file in files:
        # Return only the partition/file part of each path, relative to the table folder
        return_list.append(file.split(table_name.lower())[1])
    return return_list

def delete_cluster(cluster_id: str = None, workspace_id: str = None):
    """
    Delete a cluster
    """
    delete_cluster_response = neuro_call("80", "sparkmanager", "DeleteCluster",
                                         {"ClusterId": cluster_id, "WorkspaceId": workspace_id})

def start_cluster(cluster_id: str = None, workspace_id: str = None):
    """
    Start a cluster
    """
    start_cluster_response = neuro_call("80", "sparkmanager", "StartCluster",
                                        {"ClusterId": cluster_id, "WorkspaceId": workspace_id})

def get_job_details(job_id: str):
    """
    Get details about a submitted job
    """
    get_job_details_response = neuro_call("80", "sparkmanager", "getjobdetails",
                                          {"JobId": job_id})
    return get_job_details_response["JobDetails"]

def restart_cluster(cluster_id: str = None, workspace_id: str = None):
    """
    Restart a cluster
    Useful for downgrading libraries
    """
    restart_cluster_response = neuro_call("80", "sparkmanager", "RestartCluster",
                                          {"ClusterId": cluster_id, "WorkspaceId": workspace_id})

def list_jobs(workspace_id: str = None, cluster_id: str = None, max_returned: int = None):
    """
    List the jobs submitted to spark manager
    """
    list_jobs_response = neuro_call("80", "sparkmanager", "listjobs",
                                    {
                                        "WorkspaceId": workspace_id,
                                        "ClusterId": cluster_id,
                                        "NumberReturned": max_returned
                                    })
    return list_jobs_response["JobSummaries"]

def inspect_command(command_id: str):
    """
    Inspect the status and result of a command
    """
    inspect_command_response = neuro_call("80", "sparkmanager", "InspectCommand",
                                          {"CommandId": command_id})
    del inspect_command_response['Error']
    del inspect_command_response['ErrorCode']
    return inspect_command_response

def list_commands(context_id: str):
    """
    List commands in a context
    """
    list_commands_response = neuro_call("80", "sparkmanager", "ListCommands",
                                        {"ContextId": context_id})
    del list_commands_response['Error']
    del list_commands_response['ErrorCode']
    return list_commands_response

def inspect_context(context_id: str):
    """
    Inspect status of an interactive spark context
    """
    inspect_context_response = neuro_call("80", "sparkmanager", "InspectContext",
                                          {"ContextId": context_id})
    del inspect_context_response['Error']
    del inspect_context_response['ErrorCode']
    return inspect_context_response
