    def loadNoisyAnswers(self, application_id: str, postfix: str = "") -> __NodeDict__:
        """
        Load pickled noisy geonodes for all levels
        :param application_id: Unique DAS run ID which is part of filenames to load
        :param postfix: postfix to add to default filename (e.g. "_ms" for minimal_schema run)
        :return: by-geolevel dictionary of noisy nodes RDDs
        """
        spark = SparkSession.builder.getOrCreate()

        # Only load bottom level if no budgets spent on higher levels (like in bottom-up)
        levels2load = self.levels if self.geolevel_prop_budgets is not None else self.levels[:1]

        nodes_dict = {}
        for level in levels2load:
            path = self.noisyPath(application_id, level, postfix)
            if path.startswith(C.HDFS_PREFIX):
                level_rdd = spark.sparkContext.pickleFile(das_utils.expandPathRemoveHdfs(path))
            elif das_utils.isS3Path(path):
                level_rdd = spark.sparkContext.pickleFile(path)
            else:
                # pickle.load needs an open file object, not a path string
                with open(path, 'rb') as f:
                    level_rdd = spark.sparkContext.parallelize(pickle.load(f))
            nodes_dict[level] = level_rdd if self.use_spark else RDDLikeList(level_rdd.collect())

        return nodes_dict
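For context, the snippet below is a minimal, self-contained sketch (not part of the DAS codebase) of the pickleFile round trip that loadNoisyAnswers relies on for HDFS and S3 paths: an RDD of nodes is written with saveAsPickleFile and read back with SparkContext.pickleFile. The path and node payload are made up for illustration.

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

# Illustrative stand-ins for the pickled noisy geonodes
nodes = sc.parallelize([{"geocode": "44", "raw": [1, 2, 3]},
                        {"geocode": "45", "raw": [4, 5, 6]}])

# saveAsPickleFile fails if the target directory already exists
nodes.saveAsPickleFile("/tmp/noisy_nodes/Block", batchSize=10)

# Reading it back mirrors the HDFS/S3 branches of loadNoisyAnswers
reloaded = sc.pickleFile("/tmp/noisy_nodes/Block")
print(reloaded.count())  # 2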
Example #2
def saveRunData(path, config=None, feas_dict=None, rdd=None, batchSize=10):
    if path[-1] == '/':
        path = path[0:-1]

    # needed when not an s3 path, as the with open context assumes the folder already exists
    if not das_utils.isS3Path(path):
        das_utils.makePath(path)

    if config is not None:
        config_path = path + "/config.ini"
        logging.debug("Saving config to directory: {}".format(config_path))
        das_utils.saveConfigFile(config_path, config)

    if rdd is not None:
        logging.debug("Pickle Batch Size: {}".format(batchSize))
        data_path = path + "/data"
        logging.debug("Saving data to directory: {}".format(data_path))
        das_utils.savePickledRDD(data_path, rdd, batchSize=batchSize)

    if feas_dict is not None:
        for key in feas_dict.keys():
            # this seems redundant, but is actually needed for the accumulator
            feas_dict[key] = feas_dict[key].value
        logging.info("Feasibility dictionary: {}".format(feas_dict))
        feas_path = path + "/feas_dict.json"
        logging.debug("Saving feas_dict to directory: {}".format(feas_path))
        das_utils.saveJSONFile(feas_path, feas_dict)
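As a side note on the feas_dict handling above: Spark Accumulator objects are not JSON-serializable, so their .value has to be extracted on the driver before the dictionary is written out. The following standalone sketch (hypothetical key name, plain json instead of das_utils.saveJSONFile) illustrates why that loop is needed.

import json
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

feas_dict = {"infeasible_count": sc.accumulator(0)}

# Workers update the accumulator; only the driver may read .value
sc.parallelize(range(100)).foreach(lambda _: feas_dict["infeasible_count"].add(1))

for key in feas_dict.keys():
    feas_dict[key] = feas_dict[key].value  # unwrap to a plain int

with open("/tmp/feas_dict.json", "w") as f:
    json.dump(feas_dict, f)  # would fail with an Accumulator still inside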
Example #3
    def saveRunData(self, path, feas_dict=None, rdd=None):
        self.annotate("saveRunData", verbose=True)
        if path[-1] == '/':
            path = path[0:-1]

        # RDD must be saved first, because it needs an empty prefix.
        if rdd is not None:
            output_datafile_name = os.path.join(path, self.output_datafname)

            if self.overwrite_flag:
                das_utils.clearPath(output_datafile_name)

            # needed when not an s3 path, as the with open context assumes the folder already exists
            if not das_utils.isS3Path(output_datafile_name):
                das_utils.makePath(output_datafile_name)

            output_metadata_file_name = output_datafile_name + "/0_metadata"  # sorts before 'p'
            output_header_file_name = output_datafile_name + "/1_header"  # sorts before 'p' but after '0_metadata'
            self.annotate(f"writing RDD to {output_datafile_name}")
            self.saveRDD(output_datafile_name, rdd)

            if self.write_metadata:
                now = datetime.datetime.now().isoformat()
                self.saveMetadata(path=output_metadata_file_name,
                                  now=now,
                                  count=rdd.count())
                self.saveHeader(path=output_header_file_name)

            if self.s3cat:
                self.annotate(f"combining {output_datafile_name} with s3cat")
                s3cat.s3cat(output_datafile_name,
                            demand_success=True,
                            suffix=self.s3cat_suffix,
                            verbose=self.s3cat_verbose)
                self.add_output_path(output_datafile_name + self.s3cat_suffix)
            else:
                self.add_output_path(output_datafile_name)

        config_path = os.path.join(path, C.CONFIG_INI)
        self.annotate("Saving config to directory: {}".format(config_path))
        das_utils.saveConfigFile(config_path, self.config)

        if feas_dict is not None:
            for key in feas_dict.keys():
                if hasattr(feas_dict[key], 'value'):
                    # this seems redundant, but is actually needed for the accumulator
                    feas_dict[key] = feas_dict[key].value
            self.log_and_print(f"Feasibility dictionary: {feas_dict}")
            feas_path = os.path.join(path, C.FEAS_DICT_JSON)
            self.annotate(f"Saving feas_dict to directory: {feas_path}")
            das_utils.saveJSONFile(feas_path, feas_dict)
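The "0_metadata" and "1_header" names above are chosen purely for lexicographic ordering: when the objects under the output prefix are later concatenated into a single file, sorting the keys puts the metadata first, the header second, and Spark's part-* files last. A quick illustration (made-up key names, standard sorted() instead of the real s3cat listing):

keys = ["part-00001", "0_metadata", "part-00000", "_SUCCESS", "1_header"]
ordered = sorted(k for k in keys if not k.startswith("_"))
print(ordered)  # ['0_metadata', '1_header', 'part-00000', 'part-00001']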
Example #4
    def saveRunData(self, path, feas_dict=None, rdd=None):
        self.annotate("saveRunData", verbose=True)
        if path[-1] == '/':
            path = path[0:-1]

        # RDD must be saved first, because it needs an empty prefix.
        if rdd is not None:
            output_datafile_name      = os.path.join(path, self.output_datafname)

            if self.overwrite_flag:
                das_utils.clearPath(output_datafile_name)

            # needed when not an s3 path, as the with open context assumes the folder already exists
            if not das_utils.isS3Path(output_datafile_name):
                das_utils.makePath(output_datafile_name)

            output_metadata_file_name = output_datafile_name+"/0_metadata"  # sorts before 'p'
            output_header_file_name   = output_datafile_name+"/1_header"    # sorts before 'p' but after '0_metadata'
            self.annotate(f"writing RDD to {output_datafile_name}")
            self.saveRDD(output_datafile_name, rdd)

            if self.write_metadata:
                now = datetime.datetime.now().isoformat()
                self.saveMetadata(path=output_metadata_file_name, now=now, count=rdd.count())
                self.saveHeader(path=output_header_file_name)

            if self.s3cat:
                # If we combine the data with s3cat
                # note the combined filename in the annotated output, the DFXML file, the DVS object, and do it.

                self.annotate(f"combining {output_datafile_name} with s3cat")

                # Record this with DFXML
                ET.SubElement(self.das.dfxml_writer.doc, CC.DAS_S3CAT,
                              {'output_datafile_name':output_datafile_name,
                               'demand_success':'True',
                               'suffix':self.s3cat_suffix,
                               'verbose':str(self.s3cat_verbose)})

                self.add_output_path(output_datafile_name + self.s3cat_suffix)
                s3cat.s3_cat(output_datafile_name)
            else:
                # Otherwise just note the prefix in DFS and DFXML
                ET.SubElement(self.das.dfxml_writer.doc, CC.DAS_OUTPUT).text = output_datafile_name + "/"
                self.add_output_path(output_datafile_name + "/")


        config_path = os.path.join(path, f"{self.output_datafname}_{CC.CONFIG_INI}")

        self.annotate("Saving the flattened config to directory: {}".format(config_path))
        das_utils.saveConfigFile(config_path, self.config)
        f = io.StringIO()
        self.config.write(f)
        ET.SubElement(self.das.dfxml_writer.doc, CC.DAS_CONFIG).text = f.getvalue()


        if feas_dict is not None:
            for key in feas_dict.keys():
                if hasattr(feas_dict[key], 'value'):
                    feas_dict[key] = feas_dict[key].value  # this seems redundant, but is actually needed for the accumulator
            self.log_and_print(f"Feasibility dictionary: {feas_dict}")
            feas_path = os.path.join(path, f"{self.output_datafname}_{CC.FEAS_DICT_JSON}")
            self.annotate(f"Saving feas_dict to directory: {feas_path}")
            das_utils.saveJSONFile(feas_path, feas_dict)
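The io.StringIO step near the end of this last example can be reproduced in isolation. The sketch below uses a throwaway ConfigParser and hypothetical element names rather than the real dfxml_writer and CC constants, but the mechanism is the same: ConfigParser can only write to a file-like object, so the flattened config is serialized into a StringIO buffer and its contents attached as the text of an XML element in the provenance record.

import io
import configparser
import xml.etree.ElementTree as ET

config = configparser.ConfigParser()
config["writer"] = {"output_datafname": "data", "overwrite_flag": "True"}

doc = ET.Element("dfxml")                 # stand-in for self.das.dfxml_writer.doc
buf = io.StringIO()
config.write(buf)                         # ConfigParser needs a file-like object
ET.SubElement(doc, "das_config").text = buf.getvalue()

print(ET.tostring(doc, encoding="unicode"))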