Example #1
    def __init__(self, usr, passwd, gdhost="https://secure.gooddata.com"):
        """
        Initialize a GoodDataLogin object - the mandatory arguments are the GoodData login and password.
        For a host other than "secure" (white-label solution), use the gdhost parameter.
        """
        self.gdhost = gdhost
        self.usr = usr
        self.passwd = passwd

        login_json = json.loads(GoodDataLogin.login_json_template)
        login_json["postUserLogin"]["login"] = usr
        login_json["postUserLogin"]["password"] = passwd

        # step 1 - /gdc/account/login
        try:
            headers = gd.http_headers_template.copy()
            url = self.gdhost + "/gdc/account/login"
            if not gdhost.startswith("https://"):
                raise ValueError("https:// not specified in {}".format(url))

            request = urllib2.Request(url,
                                      data=json.dumps(login_json),
                                      headers=headers)
            logger.debug(gd.request_info(request))
            response = urllib2.urlopen(request)

        except ValueError as e:
            logger.error(e, exc_info=True)
            raise gd.GoodDataError("Problem with url", e)
        except urllib2.HTTPError as e:
            logger.error(e, exc_info=True)
            if e.code == 401:
                raise gd.GoodDataAPIError(
                    url, e, msg="Problem with login to GoodData platform")
            elif e.code == 404:
                raise gd.GoodDataAPIError(
                    url, e, msg="Problem with GoodData host or resource")
            else:
                raise Exception(e)
        except urllib2.URLError as e:
            logger.error(e, exc_info=True)
            raise gd.GoodDataAPIError(url,
                                      e,
                                      msg="Problem with url for GoodData host")

        # processing account/login response
        api_response = gd.check_response(response)
        logger.debug(gd.response_info(api_response))

        account_login_response_json = json.loads(api_response["body"])
        self.login_profile = account_login_response_json["userLogin"]["state"]
        self.super_secured_token = account_login_response_json["userLogin"][
            "token"]
        # temporary token is returned in API response as X-GDC-AuthTT header
        self.temporary_token = api_response["info"]["X-GDC-AuthTT"]

        response.close()
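
A minimal usage sketch for the class above - the credentials and the white-label host are placeholders, not values from the original project:

    # a hedged sketch, assuming GoodDataLogin is importable from this module
    glo = GoodDataLogin("jane.doe@example.com", "secret")
    # a white-label deployment would pass its own host instead:
    # glo = GoodDataLogin("jane.doe@example.com", "secret", gdhost="https://analytics.example.com")
    print(glo.login_profile)  # state URI returned by the account/login call
    glo.logout()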
Example #2
    def logout(self):
        """Performs logout from GoodData - SST token is destroyed"""
        headers = {
            'Accept': 'application/json',
            'X-GDC-AuthSST': self.super_secured_token,
            'X-GDC-AuthTT': self.generate_temporary_token()
        }

        url = "https://secure.gooddata.com" + self.login_profile
        request = urllib2.Request(url, headers=headers)
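        # urllib2 has no native support for the DELETE method, so override get_method on the request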
        request.get_method = lambda: 'DELETE'

        try:
            logger.debug(gd.request_info(request))
            response = urllib2.urlopen(request)
        except urllib2.HTTPError as e:
            logger.warning(e, exc_info=True)
            raise gd.GoodDataAPIError(url, e, msg="Problem during logout")

        # processing logout response
        api_response = gd.check_response(response)
        logger.debug(gd.response_info(api_response))

        self.super_secured_token = ""
        self.temporary_token = ""

        response.close()
Example #3
    def generate_temporary_token(self):
        """
        Return a temporary token, which is required for any call to the GoodData API.
        The temporary token is valid only for a short period of time (usually 10 minutes),
        so call this function again whenever a 401 HTTP code is returned while polling.
        """
        headers = gd.http_headers_template.copy()
        headers["X-GDC-AuthSST"] = self.super_secured_token
        url = self.gdhost + "/gdc/account/token"
        request = urllib2.Request(url, headers=headers)

        try:
            logger.debug(gd.request_info(request))
            response = urllib2.urlopen(request)
        except urllib2.HTTPError as e:
            if e.code == 401:
                """ we shouldn't receive unauthorized (bad user/pass) here - this is handled in __init__ 
                - most probably it means that SST is no longer valid or logout() had been called
                - reinitialize of instance needed """
                logger.debug(
                    "* 401 response caught after TT call - calling for valid SST."
                )
                GoodDataLogin.__init__(self, self.usr, self.passwd,
                                       self.gdhost)
            else:
                logger.error(e, exc_info=True)
                raise gd.GoodDataAPIError(
                    url, e, msg="Problem during obtaining of temporary token")
        else:
            # process the account/token response only if no exception has been caught
            api_response = gd.check_response(response)
            logger.debug(gd.response_info(api_response))
            response.close()

            self.temporary_token = json.loads(
                api_response["body"])["userToken"]["token"]

        # the token is set correctly here because even in the case of a 401 error it is set in __init__()
        return self.temporary_token
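
A short sketch of how a caller can use the temporary token for an arbitrary API call - the project id is a placeholder and glo stands for a logged-in GoodDataLogin instance:

    # a hedged sketch; "<project_id>" is a placeholder, gd.http_headers_template
    # is the same JSON header template used by the methods above
    headers = gd.http_headers_template.copy()
    headers["X-GDC-AuthTT"] = glo.generate_temporary_token()
    request = urllib2.Request(glo.gdhost + "/gdc/md/<project_id>", headers=headers)
    response = urllib2.urlopen(request)
    print(json.loads(response.read()))
    response.close()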
Example #4
    def prepare_upload(self):
        """ This function downloads manifest/s for given list of datasets from GoodData API
        1) modify names for csv columns to more human readable names
        2) save manifests in etl working directory in project/manifest
        3) save template csv with headers in etl working directory in project/csv
        4) creates final upload_info.json - in case of more datasets it creates batch mode manifest
        """
        # according to datasets we will download and consequently modify manifests for specified dataset
        headers = gd.http_headers_template.copy()
        headers["X-GDC-AuthTT"] = self.glo.generate_temporary_token()

        logger.debug("Preparing metadata for following datasets:\n{}".format(self.datasets))
        for dataset in self.datasets.keys():
            url = self.glo.gdhost + "/gdc/md/" + self.project + "/ldm/singleloadinterface/dataset." + dataset + "/manifest"
            request = urllib2.Request(url, headers=headers)

            try:
                logger.debug(gd.request_info(request))
                response = urllib2.urlopen(request)
            except urllib2.HTTPError as e:
                logger.error(e, exc_info=True)
                # for 40x errors we don't retry
                if e.code == 403 or e.code == 404:
                    raise gd.GoodDataAPIError(url, e, msg="Problem during retrieving a manifest")
                else:
                    # TODO: look at the reason, check /gdc/ping if needed, and retry if everything is OK
                    raise Exception(e)

            # processing of individual singleloadinterface/dataset response
            api_response = gd.check_response(response)
            logger.debug(gd.response_info(api_response))
            manifest_json = json.loads(api_response["body"])

            csv_header_template = []
            for p, m_column in enumerate(manifest_json["dataSetSLIManifest"]["parts"]):
                # for a date column we include the date format in the name, e.g. name(yyyy-MM-dd)
                if str(m_column["columnName"]).find(".dt_") > -1:
                    human_readable_name = str(m_column["columnName"]).split("_")[-2] + "(" + \
                                          m_column["constraints"]["date"] + ")"
                else:  # for other attributes we pick the name after the last _
                    human_readable_name = str(m_column["columnName"]).split("_")[-1]

                # instead of the originally generated names we use the human readable attribute name
                manifest_json["dataSetSLIManifest"]["parts"][p]["columnName"] = human_readable_name
                csv_header_template.append(human_readable_name)
                # update the etl mode
                manifest_json["dataSetSLIManifest"]["parts"][p]["mode"] = self.datasets[dataset]

            # name of csv file is changed within manifest
            manifest_json["dataSetSLIManifest"]["file"] = dataset + ".csv"
            # a custom mode field appended to manifest for better handling upload modes
            manifest_json["dataSetSLIManifest"]["mode"] = self.datasets[dataset]

            # save each csv_header_template to list for later usage
            csv_header_template.sort()
            self.csv_header_templates.append(csv_header_template)
            # storing each manifest in list for later usage
            self.manifests.append(manifest_json)

        try:
            # writing manifests and csv template files to the etl working directory
            for i, dtset in enumerate(self.datasets.keys()):
                # write the manifest to the manifests dir in the etl working directory
                with open(os.path.join(self.wd, self.project, "manifests", dtset + ".json"), "w") as f:
                    f.write(json.dumps(self.manifests[i], sort_keys=True, indent=2, separators=(',', ': ')))
                # write a csv file with the template header to the csv dir
                with open(os.path.join(self.wd, self.project, "csv", dtset + "_header.csv"), "w") as f:
                    # quoted column names as one comma-separated header line
                    f.write(",".join('"{}"'.format(attr) for attr in self.csv_header_templates[i]))

            # create final upload_info.json
            if len(self.datasets.keys()) > 1:  # we create SLI BATCH manifest
                upload_info_json = {"dataSetSLIManifestList": []}
                for manifest in self.manifests:
                    upload_info_json["dataSetSLIManifestList"].append(manifest)
            else:  # single manifest
                upload_info_json = dict(self.manifests[0])

            # write final upload_info.json to manifests directory
            with open(os.path.join(self.wd, self.project, "manifests", "upload_info.json"), "w") as f:
                f.write(json.dumps(upload_info_json, sort_keys=True, indent=2, separators=(',', ': ')))

        except OSError as e:
            logger.error(e, exc_info=True)
            raise gd.GoodDataError("Problem with etl working directory", e)
        except IOError as e:
            logger.error(e, exc_info=True)
            raise gd.GoodDataError("Problem during file operation", e)
Example #5
    def perform_project_load(self):

        headers = gd.http_headers_template.copy()
        headers["X-GDC-AuthTT"] = self.glo.generate_temporary_token()
        url = self.glo.gdhost + "/gdc/md/" + self.project + "/etl/pull2"
        etl_json = {"pullIntegration": self.project + "/" + self.remote_etl_dir}
        request = urllib2.Request(url, headers=headers, data=unicode(json.dumps(etl_json)))

        try:
            logger.debug(gd.request_info(request))
            response = urllib2.urlopen(request)
        except urllib2.HTTPError as e:
            # for 40x errors we don't retry
            if e.code == 404:
                logger.error(e, exc_info=True)
                raise gd.GoodDataAPIError(url, e, msg="GoodData project doesn't exist")
            elif e.code == 403:
                logger.error(e, exc_info=True)
                raise gd.GoodDataAPIError(url, e, msg="Admin or editor role is required")
            else:
                # TODO: look at the reason, check /gdc/ping if needed, and retry if everything is OK
                raise Exception(e)

        # 201 created - processing of etl/pull2 response
        api_response = gd.check_response(response)
        logger.debug(gd.response_info(api_response))
        poll_url = json.loads(api_response["body"])["pull2Task"]["links"]["poll"]

        # poll for full result
        url = self.glo.gdhost + poll_url
        etl_task_state = "RUNNING"

        logger.debug("Polling '{}' for final result".format(url))
        retry = 0
        while etl_task_state == "RUNNING":
            # current request
            request = urllib2.Request(url, headers=headers)
            try:
                response = urllib2.urlopen(request)
            except urllib2.HTTPError as e:
                if e.code == 401:
                    # a new temporary token has to be generated and the poll request repeated
                    logger.debug("* 401 response on state of ETL task - calling for valid TT.")
                    headers["X-GDC-AuthTT"] = self.glo.generate_temporary_token()
                    continue
                else:
                    retry += 1
                    if retry == 4:
                        emsg = "Problem during call for state of ETL task"
                        logger.error("{}: {}".format(emsg, e))
                        raise gd.GoodDataAPIError(url, e, emsg)
                    else:
                        logger.warning(
                            "* {} response on state of ETL task - retry({}):\n{}".format(e.code, retry, e.read()))
            else:
                # poll response
                response_body = response.read()
                etl_task_state = json.loads(response_body)["wTaskStatus"]["status"]

            # don't spam the API - wait some time between polls
            time.sleep(3)

        # ETL finished - return the state and save the textual result message
        if etl_task_state == "ERROR":
            err_params = json.loads(response_body)["wTaskStatus"]["messages"][0]["error"]["parameters"]
            err_message = json.loads(response_body)["wTaskStatus"]["messages"][0]["error"]["message"]
            logger.error(err_message % (tuple(err_params)))
            self.etl_task_result = "ERROR" + err_message % (tuple(err_params))
            etl_ok = False
        elif etl_task_state == "OK":
            self.etl_task_result = etl_task_state
            etl_ok = True
        else:  # CANCELED ?
            self.etl_task_result = etl_task_state
            etl_ok = False

        logger.info("ETL has been finished with following state:\n{}".format(self.etl_task_result))
        return etl_ok
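
For reference, the two response bodies this method parses look roughly as sketched below - only the keys the code accesses are shown, and all values are illustrative placeholders:

    # initial etl/pull2 response (201 Created)
    {"pull2Task": {"links": {"poll": "<poll_uri>"}}}

    # poll responses: a running task and a possible final ERROR state
    {"wTaskStatus": {"status": "RUNNING"}}
    {"wTaskStatus": {"status": "ERROR",
                     "messages": [{"error": {"message": "Load failed: %s",
                                             "parameters": ["reason"]}}]}}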
Example #6
    def perform_data_upload(self):
        """
        This method performs the upload to the user staging directory (WebDav).
        As this method can also be called directly (without calling prepare_upload() after creating a GoodDataETL instance),
        we have to check that all the necessary files are in place.
        """
        if not self.datasets:
            emsg = "Error: You MUST specify datatests and upload modes - add datasets and run preparation phase again"
            logger.error("{}".format(emsg))
            raise gd.GoodDataError(emsg)

        # compare headers of csv files for upload with template csv files
        try:
            for dataset in self.datasets.keys():
                header_template_file = os.path.join(self.wd, self.project, "csv", dataset + "_header.csv")
                with open(header_template_file, "r") as f:
                    reader = csv.reader(f)
                    # csv header from template file
                    header_template = reader.next()
                header_csv_file = os.path.join(self.wd, self.project, "csv", dataset + ".csv")
                with open(header_csv_file, "r") as f:
                    reader = csv.reader(f)
                    # csv header from data file
                    header_csv = reader.next()

                header_template.sort()
                header_csv.sort()

                if header_template != header_csv:
                    raise gd.GoodDataError("Error: Header of template file and csv file for upload doesn't match")

        except IOError as e:
            emsg = "Problem during comparing csv headers - check that csv files are in csv folder within ETL working directory"
            logger.error("{}: {}".format(emsg, e))
            raise gd.GoodDataError("Error: {}".format(emsg), e)
        except gd.GoodDataError as e:
            emsg = "{}\n{}:\n{}\n{}:\n{}\nFile '{}' MUST contain same columns as file '{}' (order doesn't matter)".format(
                e,
                os.path.basename(header_template_file), header_template, os.path.basename(header_csv_file), header_csv,
                os.path.basename(header_csv_file), os.path.basename(header_template_file))
            logger.error(emsg)
            raise gd.GoodDataError("Error: {}".format(emsg))
        except Exception as e:
            emsg = "Unexpected problem during comparing csv headers"
            logger.error(emsg, exc_info=True)
            raise gd.GoodDataError("Error: {}".format(emsg), traceback.print_exc())

        # creating upload.zip
        try:
            # list of files for upload.zip
            files = []
            # adding csv files with data for upload
            for dataset in self.datasets.keys():
                files.append(os.path.join(self.wd, self.project, "csv", dataset + ".csv"))
            # adding current manifest
            files.append(os.path.join(self.wd, self.project, "manifests", "upload_info.json"))

            zf = zipfile.ZipFile(os.path.join(self.wd, self.project, "upload.zip"), "w", zipfile.ZIP_DEFLATED)
            for f in files:
                zf.write(f, os.path.basename(f))
            zf.close()

            zf = zipfile.ZipFile(os.path.join(self.wd, self.project, "upload.zip"))
            with open(os.path.join(self.wd, self.project, "upload.txt"), "w") as f:
                for info in zf.infolist():
                    f.write("{}\n".format(info.filename))
                    f.write("\tModified:\t{}\n".format(datetime.datetime(*info.date_time)))
                    f.write("\tCompressed:\t{} bytes\n".format(info.compress_size))
                    f.write("\tUncompressed:\t{} bytes\n".format(info.file_size))
        except Exception as e:
            emsg = "Problem during creating upload.zip - check that all source files for upload are in csv directory"
            logger.error(emsg, exc_info=True)
            raise gd.GoodDataError("Error: {}".format(emsg), traceback.print_exc())

        # upload to WebDav
        upload_zip_size = os.path.getsize(os.path.join(self.wd, self.project, "upload.zip"))
        with open(os.path.join(self.wd, self.project, "upload.zip"), "rb") as f:
            headers = gd.http_headers_template.copy()
            headers["X-GDC-AuthTT"] = self.glo.generate_temporary_token()
            headers["Content-Type"] = "application/zip"
            headers["Content-Length"] = upload_zip_size

            self.remote_etl_dir = uuid.uuid4().hex
            url = "https://secure-di.gooddata.com/uploads/{}/{}/upload.zip".format(self.project, self.remote_etl_dir)
            request = urllib2.Request(url, headers=headers, data=f.read())
            request.get_method = lambda: 'PUT'

            try:
                logger.debug(gd.request_info(request))
                start_time = time.time()
                response = urllib2.urlopen(request)
                total_time_sec = time.time() - start_time
            except urllib2.HTTPError as e:
                logger.error(e)
                # TODO: look at the reason, check /gdc/ping if needed, and retry if everything is OK
                raise gd.GoodDataAPIError(url, e, msg="Problem during upload to GoodData WebDAV")
            except Exception as e:
                logger.error(e)
                raise Exception(e)

        with open(os.path.join(self.wd, self.project, "uploaded.to"), "w") as f:
            f.write("{}\n".format(url))

        # processing WebDav response
        api_response = gd.check_response(response)
        logger.debug(gd.response_info(api_response))
        logger.debug("File uploaded to WebDav in {}s".format(round(total_time_sec, 2)))

        response.close()
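
Putting the pieces together, a full run could look like the sketch below. The GoodDataETL constructor is not shown in the examples above, so its signature here (login object, project id, working directory, dataset-to-mode mapping) is an assumption based on the attributes the methods use; all credentials and identifiers are placeholders:

    # a hedged end-to-end sketch - the GoodDataETL constructor signature is assumed
    glo = GoodDataLogin("jane.doe@example.com", "secret")
    etl = GoodDataETL(glo, "project_id", "/tmp/etl_wd", {"orders": "FULL"})
    etl.prepare_upload()            # download + rewrite manifests, write csv header templates
    # ... fill /tmp/etl_wd/project_id/csv/orders.csv with data matching orders_header.csv ...
    etl.perform_data_upload()       # zip the csv + upload_info.json and PUT it to WebDav
    if etl.perform_project_load():  # trigger etl/pull2 and poll until it finishes
        print("Load finished OK")
    else:
        print("Load failed: {}".format(etl.etl_task_result))
    glo.logout()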