import os
import subprocess
import xml.etree.ElementTree as ET

from cmlbootstrap import CMLBootstrap  # the project's local helper module

API_KEY = os.getenv("CDSW_API_KEY")
PROJECT_NAME = os.getenv("CDSW_PROJECT")

# Instantiate the API wrapper (HOST and USERNAME are defined earlier in the script)
cml = CMLBootstrap(HOST, USERNAME, API_KEY, PROJECT_NAME)

# Set the STORAGE environment variable: reuse it if it is already set, otherwise
# derive it from the Hive metastore warehouse directory in hive-site.xml, falling
# back to the user's HDFS home directory when that config file is absent.
try:
    storage = os.environ["STORAGE"]
except KeyError:
    if os.path.exists("/etc/hadoop/conf/hive-site.xml"):
        tree = ET.parse("/etc/hadoop/conf/hive-site.xml")
        root = tree.getroot()
        for prop in root.findall("property"):
            if prop.find("name").text == "hive.metastore.warehouse.dir":
                # Reduce the warehouse dir to its filesystem root,
                # e.g. "s3a://bucket/warehouse/..." -> "s3a://bucket"
                storage = (
                    prop.find("value").text.split("/")[0]
                    + "//"
                    + prop.find("value").text.split("/")[2]
                )
    else:
        storage = "/user/" + os.getenv("HADOOP_USER_NAME")
    storage_environment_params = {"STORAGE": storage}
    storage_environment = cml.create_environment_variable(storage_environment_params)
    os.environ["STORAGE"] = storage

# A schemeless warehouse dir like "/warehouse/tablespace/..." parses to
# "//tablespace"; fall back to /tmp in that case.
if os.environ["STORAGE"] == "//tablespace":
    os.environ["STORAGE"] = "/tmp"

# Upload the data to cloud storage (-p creates the intermediate directories)
!hdfs dfs -mkdir -p $STORAGE/datalake/data/churn
!hdfs dfs -copyFromLocal /home/cdsw/raw/WA_Fn-UseC_-Telco-Customer-Churn-.csv $STORAGE/datalake/data/churn/WA_Fn-UseC_-Telco-Customer-Churn-.csv
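# Illustrative sketch (not part of the original bootstrap flow): how the
# hive-site.xml parsing above reduces the warehouse directory to its
# filesystem root. The sample value below is hypothetical; the real one
# comes from the cluster configuration.
_sample_warehouse_dir = "s3a://demo-bucket/warehouse/tablespace/managed/hive"
_parts = _sample_warehouse_dir.split("/")  # ['s3a:', '', 'demo-bucket', 'warehouse', ...]
assert _parts[0] + "//" + _parts[2] == "s3a://demo-bucket"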
# Define a helper to run HDFS commands from Python
def run_cmd(cmd, raise_err=True):
    """
    Run Linux commands using Python's subprocess module.

    Args:
        cmd (str): Linux command to run

    Returns:
        process
    """
    print("Running system command: {0}".format(cmd))