Example 1
    def curate_data(self, run_settings, location, experiment_id):
        logger.debug("vasp curate_data")
        logger.debug('location=%s' % location)
        bdp_username = getval(
            run_settings, '%s/bdp_userprofile/username' % self.SCHEMA_PREFIX)
        mytardis_url = run_settings[
            'http://rmit.edu.au/schemas/input/mytardis']['mytardis_platform']
        mytardis_settings = manage.get_platform_settings(
            mytardis_url, bdp_username)
        logger.debug(mytardis_settings)

        def _get_exp_name_for_input(path):
            return str(os.sep.join(path.split(os.sep)[-2:]))

        ename = _get_exp_name_for_input(location)

        experiment_id = mytardis.create_experiment(
            settings=mytardis_settings,
            exp_id=experiment_id,
            expname=ename,
            experiment_paramset=[
                mytardis.create_paramset("remotemake", []),
                mytardis.create_graph_paramset(
                    "expgraph",
                    name="makeexp1",
                    graph_info={
                        "axes": ["num_kp", "energy"],
                        "legends": ["TOTEN"]
                    },
                    value_dict={},
                    value_keys=[["makedset/num_kp", "makedset/toten"]]),
                mytardis.create_graph_paramset(
                    "expgraph",
                    name="makeexp2",
                    graph_info={
                        "axes": ["encut", "energy"],
                        "legends": ["TOTEN"]
                    },
                    value_dict={},
                    value_keys=[["makedset/encut", "makedset/toten"]]),
                mytardis.create_graph_paramset(
                    "expgraph",
                    name="makeexp3",
                    graph_info={
                        "axes": ["num_kp", "encut", "TOTEN"],
                        "legends": ["TOTEN"]
                    },
                    value_dict={},
                    value_keys=[[
                        "makedset/num_kp", "makedset/encut", "makedset/toten"
                    ]]),
            ])
        return experiment_id
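
A note on the _get_exp_name_for_input helper used throughout these examples: it keeps only the last two path components as the MyTardis experiment name (Example 3's docstring spells this out). A standalone sketch of the same logic:

import os

def _get_exp_name_for_input(path):
    # Keep the last two path components, e.g. '/data/runs/sweep12/vasp34'
    # becomes 'sweep12/vasp34' (assuming POSIX paths, where os.sep == '/').
    return str(os.sep.join(path.split(os.sep)[-2:]))

assert _get_exp_name_for_input('/data/runs/sweep12/vasp34') == 'sweep12/vasp34'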
Example 2
    def curate_data(self, run_settings, location, experiment_id):

        logger.debug("vasp durate_data")
        try:
            subdirective = getval(run_settings, '%s/stages/sweep/directive' % SCHEMA_PREFIX)
        except SettingNotFoundException:
            logger.warn("cannot find subdirective name")
            subdirective = ''

        if subdirective == "vasp":

            bdp_username = getval(run_settings, '%s/bdp_userprofile/username' % SCHEMA_PREFIX)
            mytardis_url = run_settings['http://rmit.edu.au/schemas/input/mytardis']['mytardis_platform']
            mytardis_settings = manage.get_platform_settings(mytardis_url, bdp_username)
            logger.debug(mytardis_settings)

            def _get_exp_name_for_input(path):
                return str(os.sep.join(path.split(os.sep)[-2:]))

            ename = _get_exp_name_for_input(location)

            experiment_id = mytardis.create_experiment(
                settings=mytardis_settings,
                exp_id=experiment_id,
                expname=ename,
                experiment_paramset=[
                    mytardis.create_paramset("remotemake", []),
                    mytardis.create_graph_paramset("expgraph",
                        name="makeexp1",
                        graph_info={"axes": ["num_kp", "energy"], "legends": ["TOTEN"]},
                        value_dict={},
                        value_keys=[["makedset/num_kp", "makedset/toten"]]),
                    mytardis.create_graph_paramset("expgraph",
                        name="makeexp2",
                        graph_info={"axes": ["encut", "energy"], "legends": ["TOTEN"]},
                        value_dict={},
                        value_keys=[["makedset/encut", "makedset/toten"]]),
                    mytardis.create_graph_paramset("expgraph",
                        name="makeexp3",
                        graph_info={"axes": ["num_kp", "encut", "TOTEN"], "legends": ["TOTEN"]},
                        value_dict={},
                        value_keys=[["makedset/num_kp", "makedset/encut", "makedset/toten"]]),
                ])

        else:
            logger.warn("unsupported subdirective '%s'" % subdirective)

        return experiment_id
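
Example 2 (and several others) reads run_settings through a getval helper that raises SettingNotFoundException for missing keys, while other examples index the nested dict directly. The real helper lives in the host framework; what follows is only a minimal sketch of the assumed behaviour, treating run_settings as a dict mapping schema URLs to field dicts:

class SettingNotFoundException(Exception):
    pass

def getval(run_settings, key):
    # '.../schemas/bdp_userprofile/username' splits into the schema
    # namespace and the field name, each looked up in turn.
    namespace, _, field = key.rpartition('/')
    try:
        return run_settings[namespace][field]
    except KeyError:
        raise SettingNotFoundException(key)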
Example 3
    def curate_data(self, run_settings, output_location, experiment_id):
        '''
           Creates experiment in MyTardis
        '''
        # Loading MyTardis credentials
        bdp_username = getval(run_settings, '%s/bdp_userprofile/username' % SCHEMA_PREFIX)
        mytardis_url = getval(run_settings, '%s/input/mytardis/mytardis_platform' % SCHEMA_PREFIX)
        mytardis_settings = manage.get_platform_settings(mytardis_url, bdp_username)

        def _get_experiment_name(path):
            '''
                Return the name for MyTardis experiment
                e.g., if path='x/y/z', returns 'y/z'
            '''
            return str(os.sep.join(path.split(os.sep)[-2:]))

        # Creates new experiment if experiment_id=0
        # If experiment_id is non-zero, the experiment is updated
        experiment_id = mytardis.create_experiment(
            settings=mytardis_settings, # MyTardis credentials
            exp_id=experiment_id,
            expname=_get_experiment_name(output_location), # name of the experiment in MyTardis
            # metadata associated with the experiment
            # a list of parameter sets
            experiment_paramset=[
                # a new blank parameter set conforming to schema 'remotemake'
                mytardis.create_paramset("remotemake", []),
                # a graph parameter set
                mytardis.create_graph_paramset("expgraph", # name of schema
                    name="randexp1", # unique graph name
                    graph_info={"axes":["x", "y"], "legends":["Random points"]}, # information about the graph
                    value_dict={}, # values to be used in parent graphs if appropriate
                    value_keys=[["randdset/x", "randdset/y"]]), # values from datasets to produce points in the graph
                           ])
        return experiment_id
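
Because create_experiment returns the experiment id, the comment above ("creates new experiment if experiment_id=0, otherwise updates") implies a simple accumulate-and-reuse pattern. A hedged usage sketch, where stage, run_settings, and the paths are hypothetical placeholders:

experiment_id = 0  # 0 asks MyTardis to create a new experiment
for output_location in ('/data/sweep12/run1', '/data/sweep12/run2'):
    # feeding the returned id back in updates the same experiment
    experiment_id = stage.curate_data(run_settings, output_location,
                                      experiment_id)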
Example 4
    def curate_data(self, run_settings, location, experiment_id):
        bdp_username = run_settings['http://rmit.edu.au/schemas/bdp_userprofile']['username']

        curate_data = run_settings['http://rmit.edu.au/schemas/input/mytardis']['curate_data']
        if curate_data:
            mytardis_url = run_settings['http://rmit.edu.au/schemas/input/mytardis']['mytardis_platform']
            mytardis_settings = manage.get_platform_settings(mytardis_url, bdp_username)
            logger.debug(mytardis_settings)

            EXP_DATASET_NAME_SPLIT = 2

            def _get_exp_name_for_input(path):
                return str(os.sep.join(path.split(os.sep)[-EXP_DATASET_NAME_SPLIT:]))

            logger.debug("location=%s" % location)
            ename = _get_exp_name_for_input(location)
            logger.debug("ename=%s" % ename)
            experiment_id = mytardis.create_experiment(
                settings=mytardis_settings,
                exp_id=experiment_id,
                expname=ename,
                experiment_paramset=[
                    mytardis.create_paramset("hrmcexp", []),
                    mytardis.create_graph_paramset("expgraph",
                        name="hrmcexp",
                        graph_info={"axes":["iteration", "criterion"], "legends":["criterion"], "precision":[0, 2]},
                        value_dict={},
                        value_keys=[["hrmcdset/it", "hrmcdset/crit"]])
                ])

        else:
            logger.warn('Data curation is off')
        return experiment_id
Example 5
    def curate_data(self, run_settings, location, experiment_id):
        logger.debug("vasp curate_data")
        logger.debug('location=%s' % location)
        bdp_username = getval(run_settings, '%s/bdp_userprofile/username' % self.SCHEMA_PREFIX)
        mytardis_url = run_settings['http://rmit.edu.au/schemas/input/mytardis']['mytardis_platform']
        mytardis_settings = manage.get_platform_settings(mytardis_url, bdp_username)
        logger.debug(mytardis_settings)

        def _get_exp_name_for_input(path):
            return str(os.sep.join(path.split(os.sep)[-2:]))

        ename = _get_exp_name_for_input(location)

        experiment_id = mytardis.create_experiment(
            settings=mytardis_settings,
            exp_id=experiment_id,
            expname=ename,
            experiment_paramset=[
                mytardis.create_paramset("remotemake", []),
                mytardis.create_graph_paramset("expgraph",
                    name="makeexp1",
                    graph_info={"axes": ["num_kp", "energy"], "legends": ["TOTEN"]},
                    value_dict={},
                    value_keys=[["makedset/num_kp", "makedset/toten"]]),
                mytardis.create_graph_paramset("expgraph",
                    name="makeexp2",
                    graph_info={"axes": ["encut", "energy"], "legends": ["TOTEN"]},
                    value_dict={},
                    value_keys=[["makedset/encut", "makedset/toten"]]),
                mytardis.create_graph_paramset("expgraph",
                    name="makeexp3",
                    graph_info={"axes": ["num_kp", "encut", "TOTEN"], "legends": ["TOTEN"]},
                    value_dict={},
                    value_keys=[["makedset/num_kp", "makedset/encut", "makedset/toten"]]),
            ])
        return experiment_id
Example 6
    def curate_data(self, run_settings, output_location, experiment_id):
        '''
           Creates experiment in MyTardis
        '''
        # Loading MyTardis credentials
        bdp_username = getval(run_settings,
                              '%s/bdp_userprofile/username' % SCHEMA_PREFIX)
        mytardis_url = getval(
            run_settings,
            '%s/input/mytardis/mytardis_platform' % SCHEMA_PREFIX)
        mytardis_settings = manage.get_platform_settings(
            mytardis_url, bdp_username)

        def _get_experiment_name(path):
            '''
                Return the name for MyTardis experiment
                e.g., if path='x/y/z', returns 'y/z'
            '''
            return str(os.sep.join(path.split(os.sep)[-2:]))

        # Creates new experiment if experiment_id=0
        # If experiment_id is non-zero, the experiment is updated
        experiment_id = mytardis.create_experiment(
            settings=mytardis_settings,  # MyTardis credentials
            exp_id=experiment_id,
            expname=_get_experiment_name(
                output_location),  # name of the experiment in MyTardis
            # metadata associated with the experiment
            # a list of parameter sets
            experiment_paramset=[
                # a new blank parameter set conforming to schema 'remotemake'
                mytardis.create_paramset("remotemake", []),
                # a graph parameter set
                mytardis.create_graph_paramset(
                    "expgraph",  # name of schema
                    name="randexp1",  # unique graph name
                    graph_info={
                        "axes": ["x", "y"],
                        "legends": ["Random points"]
                    },  # information about the graph
                    value_dict={},  # values to be used in parent graphs if appropriate
                    value_keys=[[
                        "randdset/x", "randdset/y"
                    ]]),  # values from datasets to produce points in the graph
            ])
        return experiment_id
Example 7
    def curate_data(self, run_settings, location, experiment_id):
        bdp_username = run_settings[
            'http://rmit.edu.au/schemas/bdp_userprofile']['username']

        curate_data = run_settings[
            'http://rmit.edu.au/schemas/input/mytardis']['curate_data']
        if curate_data:
            mytardis_url = run_settings[
                'http://rmit.edu.au/schemas/input/mytardis'][
                    'mytardis_platform']
            mytardis_settings = manage.get_platform_settings(
                mytardis_url, bdp_username)
            logger.debug(mytardis_settings)

            EXP_DATASET_NAME_SPLIT = 2

            def _get_exp_name_for_input(path):
                return str(
                    os.sep.join(path.split(os.sep)[-EXP_DATASET_NAME_SPLIT:]))

            logger.debug("location=%s" % location)
            ename = _get_exp_name_for_input(location)
            logger.debug("ename=%s" % ename)
            experiment_id = mytardis.create_experiment(
                settings=mytardis_settings,
                exp_id=experiment_id,
                expname=ename,
                experiment_paramset=[
                    mytardis.create_paramset("hrmcexp", []),
                    mytardis.create_graph_paramset(
                        "expgraph",
                        name="hrmcexp",
                        graph_info={
                            "axes": ["iteration", "criterion"],
                            "legends": ["criterion"],
                            "precision": [0, 2]
                        },
                        value_dict={},
                        value_keys=[["hrmcdset/it", "hrmcdset/crit"]])
                ])

        else:
            logger.warn('Data curation is off')
        return experiment_id
Example 8
    def curate_dataset(self, run_settings, experiment_id,
                       base_url, output_url, all_settings):
        '''
            Curates dataset
        '''
        # Retrieves process directories below the current output location
        iteration = int(getval(run_settings, '%s/system/id' % SCHEMA_PREFIX))
        output_prefix = '%s://%s@' % (all_settings['scheme'],
                                      all_settings['type'])
        current_output_url = "%s%s" % (output_prefix, os.path.join(
            base_url, "output_%s" % iteration))
        (scheme, host, current_output_path, location, query_settings) = storage.parse_bdpurl(output_url)
        output_fsys = storage.get_filesystem(output_url)
        process_output_dirs, _ = output_fsys.listdir(current_output_path)

        # Curates a dataset with metadata per process
        for i, process_output_dir in enumerate(process_output_dirs):
            # Expand the process output directory and add credentials for access
            process_output_url = '/'.join([current_output_url, process_output_dir])
            process_output_url_with_cred = get_url_with_credentials(
                    all_settings,
                    process_output_url,
                    is_relative_path=False)
            # Expand the process output file and add credentials for access
            output_file_url_with_cred = storage.get_url_with_credentials(
                all_settings, '/'.join([process_output_url, OUTPUT_FILE]),
                is_relative_path=False)
            try:
                output_content = storage.get_file(output_file_url_with_cred)
                val1, val2 = output_content.split()
            except (ValueError, IndexError, IOError) as e:
                # a malformed output file raises ValueError on unpacking
                logger.warn(e)
                continue
            try:
                x = float(val1)
                y = float(val2)
            except (ValueError, IndexError) as e:
                logger.warn(e)
                continue

            # Returns the process id as MyTardis dataset name
            all_settings['graph_point_id'] = str(i)
            def _get_dataset_name(settings, url, path):
                return all_settings['graph_point_id']

            # Creates new dataset and adds to experiment
            # If experiment_id==0, creates new experiment
            experiment_id = mytardis.create_dataset(
                settings=all_settings, # MyTardis credentials
                source_url=process_output_url_with_cred,
                exp_id=experiment_id,
                dataset_name=_get_dataset_name, # the function that defines dataset name
                dataset_paramset=[
                    # a new blank parameter set conforming to schema 'remotemake/output'
                    mytardis.create_paramset("remotemake/output", []),
                    mytardis.create_graph_paramset("dsetgraph", # name of schema
                        name="randdset", # a unique dataset name
                        graph_info={},
                        value_dict={"randdset/x": x, "randdset/y": y},  # values to be used in experiment graphs
                        value_keys=[]
                        ),
                    ]
                )
        return experiment_id
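
curate_dataset above assumes each process writes OUTPUT_FILE as two whitespace-separated numbers forming one graph point; anything else is logged and skipped. A standalone sketch of that parse, under the same assumption about the file format:

def parse_point(output_content):
    # e.g. '0.25 3.75\n' -> (0.25, 3.75); raises ValueError otherwise,
    # mirroring the try/except in the loop above.
    val1, val2 = output_content.split()
    return float(val1), float(val2)

assert parse_point('0.25 3.75\n') == (0.25, 3.75)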
Example 9
                    self.experiment_id = mytardis.create_dataset(
                        settings=mytardis_settings,
                        source_url=encoded_d_url,
                        exp_id=self.experiment_id,
                        exp_name=_get_exp_name_for_vasp,
                        dataset_name=_get_dataset_name_for_vasp,
                        experiment_paramset=[],
                        dataset_paramset=[
                            mytardis.create_paramset("remotemake/output", []),
                            mytardis.create_graph_paramset(
                                "dsetgraph",
                                name="makedset",
                                graph_info={},
                                value_dict={
                                    "makedset/num_kp": num_kp,
                                    "makedset/encut": encut,
                                    "makedset/toten": toten
                                } if (num_kp is not None) and
                                (encut is not None) and
                                (toten is not None) else {},
                                value_keys=[]),
                        ])
                elif directive == "remotemake":

                    def _get_exp_name_for_make(settings, url, path):
                        return str(os.sep.join(path.split(os.sep)[-2:-1]))

                    def _get_dataset_name_for_make(settings, url, path):
                        return str(os.sep.join(path.split(os.sep)[-1:]))

                    self.experiment_id = mytardis.create_dataset(
Example 10
    def curate_dataset(self, run_settings, experiment_id, base_url, output_url,
                       all_settings):
        '''
            Curates dataset
        '''
        # Retrieves process directories below the current output location
        iteration = int(getval(run_settings, '%s/system/id' % SCHEMA_PREFIX))
        output_prefix = '%s://%s@' % (all_settings['scheme'],
                                      all_settings['type'])
        current_output_url = "%s%s" % (
            output_prefix,
            os.path.join(os.path.join(base_url, "output_%s" % iteration)))
        (scheme, host, current_output_path, location,
         query_settings) = storage.parse_bdpurl(output_url)
        output_fsys = storage.get_filesystem(output_url)
        process_output_dirs, _ = output_fsys.listdir(current_output_path)

        # Curates a dataset with metadata per process
        for i, process_output_dir in enumerate(process_output_dirs):
            # Expand the process output directory and add credentials for access
            process_output_url = '/'.join(
                [current_output_url, process_output_dir])
            process_output_url_with_cred = get_url_with_credentials(
                all_settings, process_output_url, is_relative_path=False)
            # Expand the process output file and add credentials for access
            output_file_url_with_cred = storage.get_url_with_credentials(
                all_settings,
                '/'.join([process_output_url, OUTPUT_FILE]),
                is_relative_path=False)
            try:
                output_content = storage.get_file(output_file_url_with_cred)
                val1, val2 = output_content.split()
            except (ValueError, IndexError, IOError) as e:
                # a malformed output file raises ValueError on unpacking
                logger.warn(e)
                continue
            try:
                x = float(val1)
                y = float(val2)
            except (ValueError, IndexError) as e:
                logger.warn(e)
                continue

            # Returns the process id as MyTardis dataset name
            all_settings['graph_point_id'] = str(i)

            def _get_dataset_name(settings, url, path):
                return all_settings['graph_point_id']

            # Creates new dataset and adds to experiment
            # If experiment_id==0, creates new experiment
            experiment_id = mytardis.create_dataset(
                settings=all_settings,  # MyTardis credentials
                source_url=process_output_url_with_cred,
                exp_id=experiment_id,
                dataset_name=_get_dataset_name,  # the function that defines dataset name
                dataset_paramset=[
                    # a new blank parameter set conforming to schema 'remotemake/output'
                    mytardis.create_paramset("remotemake/output", []),
                    mytardis.create_graph_paramset(
                        "dsetgraph",  # name of schema
                        name="randdset",  # a unique dataset name
                        graph_info={},
                        value_dict={
                            "randdset/x": x,
                            "randdset/y": y
                        },  # values to be used in experiment graphs
                        value_keys=[]),
                ])
        return experiment_id
Example 11
            #return str(os.sep.join(path.split(os.sep)[-EXP_DATASET_NAME_SPLIT:]))

        all_settings['ENCUT'] = encut
        all_settings['NUMKP'] = num_kp
        all_settings['RUNCOUNTER'] = all_settings['contextid']

        current_output_url_cred = storage.get_url_with_credentials(
            all_settings, current_output_url, is_relative_path=False)
        experiment_id = mytardis.create_dataset(
            settings=all_settings,
            source_url=current_output_url_cred,
            exp_id=experiment_id,
            exp_name=_get_exp_name_for_vasp,
            dataset_name=_get_dataset_name_for_vasp,
            dataset_paramset=[
                mytardis.create_paramset("remotemake/output", []),
                mytardis.create_graph_paramset("dsetgraph",
                    name="makedset",
                    graph_info={},
                    value_dict={"makedset/num_kp": num_kp, "makedset/encut": encut, "makedset/toten": toten}
                        if (num_kp is not None)
                            and (encut is not None)
                            and (toten is not None) else {},
                    value_keys=[]
                    ),
                ]
            )

        return experiment_id

Example 12
    def curate_data(self, run_settings, location, experiment_id):

        logger.debug("vasp durate_data")
        try:
            subdirective = getval(run_settings,
                                  '%s/stages/sweep/directive' % SCHEMA_PREFIX)
        except SettingNotFoundException:
            logger.warn("cannot find subdirective name")
            subdirective = ''

        if subdirective == "vasp":

            bdp_username = getval(
                run_settings, '%s/bdp_userprofile/username' % SCHEMA_PREFIX)
            mytardis_url = run_settings[
                'http://rmit.edu.au/schemas/input/mytardis'][
                    'mytardis_platform']
            mytardis_settings = manage.get_platform_settings(
                mytardis_url, bdp_username)
            logger.debug(mytardis_settings)

            def _get_exp_name_for_input(path):
                return str(os.sep.join(path.split(os.sep)[-2:]))

            ename = _get_exp_name_for_input(location)

            experiment_id = mytardis.create_experiment(
                settings=mytardis_settings,
                exp_id=experiment_id,
                expname=ename,
                experiment_paramset=[
                    mytardis.create_paramset("remotemake", []),
                    mytardis.create_graph_paramset(
                        "expgraph",
                        name="makeexp1",
                        graph_info={
                            "axes": ["num_kp", "energy"],
                            "legends": ["TOTEN"]
                        },
                        value_dict={},
                        value_keys=[["makedset/num_kp", "makedset/toten"]]),
                    mytardis.create_graph_paramset(
                        "expgraph",
                        name="makeexp2",
                        graph_info={
                            "axes": ["encut", "energy"],
                            "legends": ["TOTEN"]
                        },
                        value_dict={},
                        value_keys=[["makedset/encut", "makedset/toten"]]),
                    mytardis.create_graph_paramset(
                        "expgraph",
                        name="makeexp3",
                        graph_info={
                            "axes": ["num_kp", "encut", "TOTEN"],
                            "legends": ["TOTEN"]
                        },
                        value_dict={},
                        value_keys=[[
                            "makedset/num_kp", "makedset/encut",
                            "makedset/toten"
                        ]]),
                ])

        else:
            logger.warn("unsupported subdirective '%s'" % subdirective)

        return experiment_id
Example 13
class HRMCConverge(Converge):

    def input_valid(self, settings_to_test):
        """ Return a tuple, where the first element is True settings_to_test
        are syntactically and semantically valid for this stage.  Otherwise,
        return False with the second element in the tuple describing the
        problem
        """
        error = []
        try:
            int(getval(settings_to_test, '%s/input/hrmc/max_iteration' % RMIT_SCHEMA))
        except (ValueError, SettingNotFoundException):
            error.append("Cannot load max_iteration")

        try:
            float(getval(settings_to_test, '%s/input/hrmc/error_threshold' % RMIT_SCHEMA))
        except (SettingNotFoundException, ValueError):
            error.append("Cannot load error threshold")

        if error:
            return (False, '. '.join(error))
        return (True, "ok")

    def curate_dataset(self, run_settings, experiment_id, base_dir, output_url, all_settings):
        logger.debug("curate_dataset")
        iter_output_dir = os.path.join(os.path.join(base_dir, "output"))
        logger.debug("iter_output_dir=%s" % iter_output_dir)

        output_prefix = '%s://%s@' % (all_settings['scheme'],
                                      all_settings['type'])
        iter_output_dir = "%s%s" % (output_prefix, iter_output_dir)
        logger.debug("iter_output_dir=%s" % iter_output_dir)
        logger.debug("output_url=%s" % output_url)
        (scheme, host, mypath, location, query_settings) = storage.parse_bdpurl(output_url)
        fsys = storage.get_filesystem(output_url)

        node_output_dirnames, _ = fsys.listdir(mypath)
        logger.debug("node_output_dirnames=%s" % node_output_dirnames)

        curate_data = (getval(run_settings, '%s/input/mytardis/curate_data' % RMIT_SCHEMA))
        if curate_data:
            if all_settings['mytardis_host']:

                EXP_DATASET_NAME_SPLIT = 2

                def get_exp_name_for_output(settings, url, path):
                    return str(os.sep.join(path.split(os.sep)[:-EXP_DATASET_NAME_SPLIT]))

                def get_dataset_name_for_output(settings, url, path):
                    logger.debug("path=%s" % path)

                    host = settings['host']
                    prefix = 'ssh://%s@%s' % (settings['type'], host)

                    source_url = get_url_with_credentials(
                        settings, os.path.join(prefix, path, "HRMC.inp_values"),
                        is_relative_path=False)
                    logger.debug("source_url=%s" % source_url)
                    try:
                        content = storage.get_file(source_url)
                    except IOError as e:
                        logger.warn("cannot read file %s" % e)
                        return str(os.sep.join(path.split(os.sep)[-EXP_DATASET_NAME_SPLIT:]))

                    logger.debug("content=%s" % content)
                    try:
                        values_map = dict(json.loads(str(content)))
                    except Exception as e:
                        logger.error("cannot load values_map %s: from %s.  Error=%s" % (content, source_url, e))
                        return str(os.sep.join(path.split(os.sep)[-EXP_DATASET_NAME_SPLIT:]))

                    try:
                        iteration = str(path.split(os.sep)[-2:-1][0])
                    except Exception as e:
                        logger.error(e)
                        iteration = ""

                    if "_" in iteration:
                        iteration = iteration.split("_")[1]
                    else:
                        iteration = "final"

                    dataset_name = "%s_%s_%s" % (iteration,
                        values_map['generator_counter'],
                        values_map['run_counter'])
                    logger.debug("dataset_name=%s" % dataset_name)
                    return dataset_name

                re_dbl_fort = re.compile(r'(\d*\.\d+)[dD]([-+]?\d+)')

                exp_value_keys = []
                legends = []
                for m, node_dir in enumerate(node_output_dirnames):
                    node_path = os.path.join(iter_output_dir, node_dir)

                    exp_value_keys.append(["hrmcdset%s/step" % m, "hrmcdset%s/err" % m])

                    source_url = get_url_with_credentials(all_settings,
                                                   node_path, is_relative_path=False)

                    (source_scheme, source_location, source_path, source_location,
                        query_settings) = storage.parse_bdpurl(source_url)
                    logger.debug("source_url=%s" % source_url)
                    legends.append(
                        get_dataset_name_for_output(
                            all_settings, "", source_path))

                logger.debug("exp_value_keys=%s" % exp_value_keys)
                logger.debug("legends=%s" % legends)

                graph_paramset = [mytardis.create_graph_paramset("expgraph",
                    name="hrmcexp2",
                    graph_info={"axes": ["step", "ERRGr*wf"], "precision": [0, 2], "legends": legends},
                    value_dict={},
                    value_keys=exp_value_keys)]

                for m, node_dir in enumerate(node_output_dirnames):
                    node_path = os.path.join(iter_output_dir, node_dir)
                    logger.debug("node_path=%s" % node_path)

                    #FIXME: this calculation should be done as in extract_psd_func
                    # pulling directly from data_errors rather than passing in
                    # through nested function.
                    dataerrors_url = get_url_with_credentials(all_settings,
                        os.path.join(node_path, DATA_ERRORS_FILE),
                        is_relative_path=False)
                    logger.debug("dataerrors_url=%s" % dataerrors_url)
                    dataerrors_content = storage.get_file(dataerrors_url)
                    xs = []
                    ys = []
                    for i, line in enumerate(dataerrors_content.splitlines()):
                        if i == 0:
                            continue
                        columns = line.split()
                        try:
                            hrmc_step = int(columns[STEP_COLUMN_NUM])
                        except ValueError:
                            logger.warn("could not parse hrmc_step value on line %s" % i)
                            continue
                        # handle Fortran double-precision float format
                        val = columns[ERRGR_COLUMN_NUM]
                        val = re_dbl_fort.sub(r'\1E\2', val)
                        logger.debug("val=%s" % val)
                        try:
                            hrmc_errgr = float(val)
                        except ValueError:
                            logger.warn("could not parse hrmc_errgr value on line %s" % i)
                            continue
                        xs.append(hrmc_step)
                        ys.append(hrmc_errgr)

                    logger.debug("xs=%s" % xs)
                    logger.debug("ys=%s" % ys)

                    crit_url = get_url_with_credentials(all_settings,
                        os.path.join(node_path, "criterion.txt"), is_relative_path=False)
                    try:
                        crit = storage.get_file(crit_url)
                    except (ValueError, IOError):
                        crit = None
                    # FIXME: can crit be zero?
                    if crit:
                        hrmcdset_val = {"hrmcdset/it": self.id, "hrmcdset/crit": crit}
                    else:
                        hrmcdset_val = {}

                    source_url = get_url_with_credentials(
                        all_settings, node_path, is_relative_path=False)
                    logger.debug("source_url=%s" % source_url)

                    # TODO: move into utility function for reuse
                    def extract_psd_func(fp):
                        res = []
                        xs = []
                        ys = []
                        for i, line in enumerate(dataerrors_content.splitlines()):
                            if i == 0:
                                continue
                            columns = line.split()

                            val = columns[STEP_COLUMN_NUM]
                            val = re_dbl_fort.sub(r'\1E\2', val)
                            logger.debug("val=%s" % val)
                            try:
                                x = float(val)
                            except ValueError:
                                logger.warn("could not parse value on line %s" % i)
                                continue

                            val = columns[ERRGR_COLUMN_NUM]
                            val = re_dbl_fort.sub(r'\1E\2', val)
                            logger.debug("val=%s" % val)
                            try:
                                y = float(val)
                            except ValueError:
                                logger.warn("could not parse value on line %s" % i)
                                continue

                            xs.append(x)
                            ys.append(y)
                        res = {"hrmcdfile/r1": xs, "hrmcdfile/g1": ys}
                        return res

                    def extract_psdexp_func(fp):
                        res = []
                        xs = []
                        ys = []
                        for i, line in enumerate(fp):
                            columns = line.split()
                            xs.append(float(columns[0]))
                            ys.append(float(columns[1]))
                        res = {"hrmcdfile/r2": xs, "hrmcdfile/g2": ys}
                        return res

                    def extract_grfinal_func(fp):
                        res = []
                        xs = []
                        ys = []
                        for i, line in enumerate(fp):
                            columns = line.split()
                            xs.append(float(columns[0]))
                            ys.append(float(columns[1]))
                        #FIXME: len(xs) == len(ys) for this to work.
                        #TODO: hack to handle when xs and ys are too
                        # large to fit in Parameter with db_index.
                        # solved by function call at destination
                        # guard against len < 20, which would zero the modulus
                        step_x = max(1, len(xs) / 20)
                        step_y = max(1, len(ys) / 20)
                        cut_xs = [xs[i] for i, x in enumerate(xs)
                            if (i % step_x == 0)]
                        cut_ys = [ys[i] for i, x in enumerate(ys)
                            if (i % step_y == 0)]

                        res = {"hrmcdfile/r3": cut_xs, "hrmcdfile/g3": cut_ys}
                        return res

                    def extract_inputgr_func(fp):
                        res = []
                        xs = []
                        ys = []
                        for i, line in enumerate(fp):
                            columns = line.split()
                            xs.append(float(columns[0]))
                            ys.append(float(columns[1]))
                        #FIXME: len(xs) == len(ys) for this to work.
                        #TODO: hack to handle when xs and ys are too
                        # large to fit in Parameter with db_index.
                        # solved by function call at destination
                        # guard against len < 20, which would zero the modulus
                        step_x = max(1, len(xs) / 20)
                        step_y = max(1, len(ys) / 20)
                        cut_xs = [xs[i] for i, x in enumerate(xs)
                            if (i % step_x == 0)]
                        cut_ys = [ys[i] for i, x in enumerate(ys)
                            if (i % step_y == 0)]

                        res = {"hrmcdfile/r4": cut_xs, "hrmcdfile/g4": cut_ys}
                        return res
                    # TODO: replace self.boto_settings with mytardis_settings

                    experiment_id = mytardis.create_dataset(
                        settings=all_settings,
                        source_url=source_url,
                        exp_name=get_exp_name_for_output,
                        dataset_name=get_dataset_name_for_output,
                        exp_id=experiment_id,
                        experiment_paramset=graph_paramset,
                        dataset_paramset=[
                            mytardis.create_paramset('hrmcdataset/output', []),
                            mytardis.create_graph_paramset('dsetgraph',
                                name="hrmcdset",
                                graph_info={"axes":["r (Angstroms)", "PSD"],
                                    "legends":["psd", "PSD_exp"],  "type":"line"},
                                value_dict=hrmcdset_val,
                                value_keys=[["hrmcdfile/r1", "hrmcdfile/g1"],
                                    ["hrmcdfile/r2", "hrmcdfile/g2"]]),
                            mytardis.create_graph_paramset('dsetgraph',
                                name='hrmcdset2',
                                graph_info={"axes":["r (Angstroms)", "g(r)"],
                                    "legends":["data_grfinal", "input_gr"],
                                    "type":"line"},
                                value_dict={},
                                value_keys=[["hrmcdfile/r3", "hrmcdfile/g3"],
                                    ["hrmcdfile/r4", "hrmcdfile/g4"]]),
                            mytardis.create_graph_paramset('dsetgraph',
                                name='hrmcdset%s' % m,
                                graph_info={},
                                value_dict={"hrmcdset%s/step" % m: xs,
                                    "hrmcdset%s/err" % m: ys},
                                value_keys=[]),
                            ],
                        datafile_paramset=[
                            mytardis.create_graph_paramset('dfilegraph',
                                name="hrmcdfile",
                                graph_info={},
                                value_dict={},
                                value_keys=[])
                            ],
                        dfile_extract_func={
                            'psd.dat': extract_psd_func,
                             'PSD_exp.dat': extract_psdexp_func,
                             'data_grfinal.dat': extract_grfinal_func,
                             'input_gr.dat': extract_inputgr_func}

                        )
                    graph_paramset = []
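
The re_dbl_fort substitution in Example 13 exists because Fortran writes double-precision literals with a 'D' exponent marker (e.g. 0.12345D+03), which Python's float() rejects. A quick standalone demonstration of the rewrite:

import re

# Rewrite the Fortran 'D' exponent marker to 'E' so float() can parse it.
re_dbl_fort = re.compile(r'(\d*\.\d+)[dD]([-+]?\d+)')
print(float(re_dbl_fort.sub(r'\1E\2', '0.12345D+03')))  # 123.45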
Example 14
class HRMCTransform(Transform):
    def input_valid(self, settings_to_test):
        """ Return a tuple, where the first element is True settings_to_test
        are syntactically and semantically valid for this stage.  Otherwise,
        return False with the second element in the tuple describing the
        problem
        """
        logger.debug("settings_to_test=%s" % settings_to_test)
        error = []
        try:
            ast.literal_eval(
                getval(settings_to_test,
                       '%s/input/hrmc/threshold' % RMIT_SCHEMA))
        except (ValueError, SettingNotFoundException):
            error.append("Cannot load threshold")

        if error:
            return (False, '. '.join(error))
        return (True, "ok")

    def is_triggered(self, run_settings):
        super_trigger = super(HRMCTransform, self).is_triggered(run_settings)
        if super_trigger:
            try:
                # FIXME: need to validate this output to make sure list of int
                ast.literal_eval(
                    getval(run_settings,
                           '%s/input/hrmc/threshold' % RMIT_SCHEMA))
            except (SettingNotFoundException, ValueError):
                logger.warn("no threshold found when expected")
                return False
        return super_trigger

    def process_outputs(self, run_settings, base_dir, output_url, all_settings,
                        offset):

        # output_dir = 118.138.241.232/outptuersdfsd/sweep277/hrmc278/output_1
        # output_prefix = ssh://unix@
        # node_output_dir = 2

        output_prefix = '%s://%s@' % (all_settings['scheme'],
                                      all_settings['type'])

        id = int(getval(run_settings, '%s/system/id' % RMIT_SCHEMA))
        iter_output_dir = os.path.join(base_dir, "output_%s" % id)
        logger.debug('iter_output_dir=%s' % iter_output_dir)
        logger.debug('output_prefix=%s' % output_prefix)
        #iter_output_dir = "%s%s" % (output_prefix, iter_output_dir)
        logger.debug('output_url=%s' % output_url)
        (scheme, host, iter_output_path, location,
         query_settings) = storage.parse_bdpurl(output_url)
        logger.debug("iter_output_path=%s" % iter_output_path)
        iter_out_fsys = storage.get_filesystem(output_url)
        logger.debug("iter_out_fsys=%s" % iter_out_fsys)
        node_output_dirnames, _ = iter_out_fsys.listdir(iter_output_path)
        logger.debug('node_output_dirnames=%s' % node_output_dirnames)
        self.audit = ""

        Node_info = namedtuple('Node_info', ['dirname', 'number', 'criterion'])

        BASE_FNAME = "HRMC.inp"

        # generate criteria
        self.outputs = []
        for node_output_dirname in node_output_dirnames:
            node_path = output_prefix + os.path.join(iter_output_dir,
                                                     node_output_dirname)
            criterion = self.compute_psd_criterion(all_settings, node_path)
            #criterion = self.compute_hrmc_criterion(values_map['run_counter'], node_output_dirname, fs,)
            logger.debug("criterion=%s" % criterion)

            try:
                values_url = get_url_with_credentials(
                    all_settings,
                    os.path.join(node_path, '%s_values' % BASE_FNAME),
                    is_relative_path=False)

                values_content = storage.get_file(values_url)

                logger.debug("values_file=%s" % values_url)
            except IOError:
                logger.warn("no values file found")
                values_map = {}
            else:
                values_map = dict(json.loads(values_content))

            self.outputs.append(
                Node_info(dirname=node_output_dirname,
                          # default 0 avoids a KeyError when no values file was found
                          number=values_map.get('run_counter', 0),
                          criterion=criterion))

        if not self.outputs:
            logger.error("no ouput found for this iteration")
            return

        self.outputs.sort(key=lambda x: int(x.criterion))
        logger.debug("self.outputs=%s" % self.outputs)

        try:
            # FIXME: need to validate this output to make sure list of int
            threshold = ast.literal_eval(
                getval(run_settings, '%s/input/hrmc/threshold' % RMIT_SCHEMA))
        except (SettingNotFoundException, ValueError):
            logger.warn("no threshold found when expected")
            return False
        logger.debug("threshold = %s" % threshold)
        total_picks = 1
        if len(threshold) > 1:
            # multiply the threshold entries themselves, not list lookups
            for t in threshold:
                total_picks *= t
        else:
            total_picks = threshold[0]

        def copy_files_with_pattern(iter_out_fsys, source_path, dest_path,
                                    pattern, all_settings):
            """
            """
            output_prefix = '%s://%s@' % (all_settings['scheme'],
                                          all_settings['type'])

            logger.debug('source_path=%s, dest_path=%s' %
                         (source_path, dest_path))
            # (scheme, host, iter_output_path, location, query_settings) = storage.parse_bdpurl(source_path)
            _, node_output_fnames = iter_out_fsys.listdir(source_path)
            ip_address = all_settings['ip_address']
            for f in node_output_fnames:
                if fnmatch.fnmatch(f, pattern):
                    source_url = get_url_with_credentials(
                        all_settings,
                        output_prefix +
                        os.path.join(ip_address, source_path, f),
                        is_relative_path=False)
                    dest_url = get_url_with_credentials(
                        all_settings,
                        output_prefix + os.path.join(ip_address, dest_path, f),
                        is_relative_path=False)
                    logger.debug('source_url=%s, dest_url=%s' %
                                 (source_url, dest_url))
                    content = storage.get_file(source_url)
                    storage.put_file(dest_url, content)

        # Make new input dirs
        new_input_dir = os.path.join(base_dir, "input_%d" % (id + 1))
        for index in range(0, total_picks):
            node_info = self.outputs[index]  # avoid shadowing the Node_info namedtuple
            logger.debug("node_info.dirname=%s" % node_info.dirname)
            logger.debug("node_info=%s" % str(node_info))

            new_input_path = os.path.join(new_input_dir, node_info.dirname)
            logger.debug("New input node dir %s" % new_input_path)

            old_output_path = os.path.join(iter_output_dir, node_info.dirname)

            # Move all existing domain input files unchanged to next input directory
            for f in DOMAIN_INPUT_FILES:
                source_url = get_url_with_credentials(
                    all_settings,
                    output_prefix + os.path.join(old_output_path, f),
                    is_relative_path=False)
                dest_url = get_url_with_credentials(
                    all_settings,
                    output_prefix + os.path.join(new_input_path, f),
                    is_relative_path=False)
                logger.debug('source_url=%s, dest_url=%s' %
                             (source_url, dest_url))

                content = storage.get_file(source_url)
                logger.debug('content collected')
                storage.put_file(dest_url, content)
                logger.debug('put successfully')

            logger.debug('put file successfully')
            pattern = "*_values"
            output_offset = os.path.join(offset, "output_%s" % id,
                                         node_info.dirname)
            input_offset = os.path.join(offset, "input_%s" % (id + 1),
                                        node_info.dirname)
            copy_files_with_pattern(iter_out_fsys, output_offset, input_offset,
                                    pattern, all_settings)

            pattern = "*_template"
            copy_files_with_pattern(iter_out_fsys, output_offset, input_offset,
                                    pattern, all_settings)

            # NB: Converge stage triggers based on criterion value from audit.
            logger.debug('starting audit')
            info = "Run %s preserved (error %s)\n" % (Node_info.number,
                                                      Node_info.criterion)
            audit_url = get_url_with_credentials(
                all_settings,
                output_prefix + os.path.join(new_input_path, 'audit.txt'),
                is_relative_path=False)
            storage.put_file(audit_url, info)
            logger.debug("audit=%s" % info)
            logger.debug('1:audit_url=%s' % audit_url)
            self.audit += info

            # move xyz_final.xyz to initial.xyz
            source_url = get_url_with_credentials(
                all_settings,
                output_prefix + os.path.join(old_output_path, "xyz_final.xyz"),
                is_relative_path=False)
            logger.debug('source_url=%s' % source_url)
            dest_url = get_url_with_credentials(
                all_settings,
                output_prefix +
                os.path.join(new_input_path, 'input_initial.xyz'),
                is_relative_path=False)
            logger.debug('dest_url=%s' % dest_url)
            content = storage.get_file(source_url)
            logger.debug('content=%s' % content)
            storage.put_file(dest_url, content)
            self.audit += "spawning diamond runs\n"

        logger.debug(
            "input_dir=%s" %
            (output_prefix + os.path.join(new_input_dir, 'audit.txt')))
        audit_url = get_url_with_credentials(
            all_settings,
            output_prefix + os.path.join(new_input_dir, 'audit.txt'),
            is_relative_path=False)
        logger.debug('audit_url=%s' % audit_url)
        storage.put_file(audit_url, self.audit)

    def compute_psd_criterion(self, all_settings, node_path):
        import math
        import os
        #globalFileSystem = fs.get_global_filesystem()
        # psd = os.path.join(globalFileSystem,
        #                    self.output_dir, node_output_dir,
        #                    "PSD_output/psd.dat")
        #Fixme replace all reference to files by parameters, e.g PSDCode
        output_prefix = '%s://%s@' % (all_settings['scheme'],
                                      all_settings['type'])
        logger.debug('output_prefix=%s' % output_prefix)
        logger.debug('node_path=%s' % node_path)

        logger.debug('compute psd---')
        psd_url = get_url_with_credentials(all_settings,
                                           os.path.join(
                                               node_path, "PSD_output",
                                               "psd.dat"),
                                           is_relative_path=False)
        logger.debug('psd_url=%s' % psd_url)

        psd = storage.get_filep(psd_url)
        logger.debug('psd=%s' % psd._name)

        # psd_exp = os.path.join(globalFileSystem,
        #                        self.output_dir, node_output_dir,
        #                        "PSD_output/PSD_exp.dat")
        psd_url = get_url_with_credentials(all_settings,
                                           os.path.join(
                                               node_path, "PSD_output",
                                               "PSD_exp.dat"),
                                           is_relative_path=False)
        logger.debug('psd_url=%s' % psd_url)
        psd_exp = storage.get_filep(psd_url)
        logger.debug('psd_exp=%s' % psd_exp._name)

        logger.debug("PSD %s %s " % (psd._name, psd_exp._name))
        x_axis = []
        y1_axis = []
        for line in psd:
            column = line.split()
            #logger.debug(column)
            if len(column) > 0:
                x_axis.append(float(column[0]))
                y1_axis.append(float(column[1]))
        logger.debug("x_axis \n %s" % x_axis)
        logger.debug("y1_axis \n %s" % y1_axis)

        y2_axis = []
        for line in psd_exp:
            column = line.split()
            #logger.debug(column)
            if len(column) > 0:
                y2_axis.append(float(column[1]))

        # pad the experimental series with zeros so both series have the
        # same length as the computed series
        for i in range(len(x_axis) - len(y2_axis)):
            y2_axis.append(0)
        logger.debug("y2_axis \n %s" % y2_axis)

        # criterion: sum of squared differences between the computed and
        # experimental PSD curves
        criterion = 0
        for i in range(len(y1_axis)):
            criterion += math.pow((y1_axis[i] - y2_axis[i]), 2)
        logger.debug("Criterion %f" % criterion)

        criterion_url = get_url_with_credentials(all_settings,
                                                 os.path.join(
                                                     node_path, "PSD_output",
                                                     "criterion.txt"),
                                                 is_relative_path=False)
        storage.put_file(criterion_url, str(criterion))

        return criterion
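
    # --- Illustrative sketch (not part of the original source) --------------
    # compute_psd_criterion above reduces the computed and experimental PSD
    # curves to a single sum-of-squared-differences value.  A minimal,
    # self-contained version of that reduction, assuming two already-parsed
    # series of y values:
    @staticmethod
    def _sum_squared_error(y1_axis, y2_axis):
        """Sum of squared differences between two series (sketch only)."""
        # pad the shorter series with zeros, as compute_psd_criterion does
        if len(y2_axis) < len(y1_axis):
            y2_axis = y2_axis + [0] * (len(y1_axis) - len(y2_axis))
        return sum((y1 - y2) ** 2 for y1, y2 in zip(y1_axis, y2_axis))
    # e.g. _sum_squared_error([1.0, 2.0], [1.0]) == (1-1)**2 + (2-0)**2 == 4.0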

    def compute_hrmc_criterion(self, number, node_output_dir, fs,
                               output_storage_settings):
        output_prefix = '%s://%s@' % (output_storage_settings['scheme'],
                                      output_storage_settings['type'])
        grerr_file = 'grerr%s.dat' % str(number).zfill(2)
        logger.debug("grerr_file=%s " % grerr_file)
        grerr_url = get_url_with_credentials(
            output_storage_settings,
            output_prefix + os.path.join(self.output_dir, node_output_dir,
                                         grerr_file),
            is_relative_path=False)
        grerr_content = storage.get_file(
            grerr_url)  # FIXME: check that get_file can raise IOError
        if not grerr_content:
            logger.warn("no grerr content found")
        logger.debug("grerr_content=%s" % grerr_content)
        # the criterion is the second column of the last line of the file
        try:
            criterion = float(grerr_content.strip().split('\n')[-1].split()[1])
        except (ValueError, IndexError) as e:
            logger.warn("invalid criterion found in grerr "
                        "file for %s/%s: %s" %
                        (self.output_dir, node_output_dir, e))
            raise
        logger.debug("criterion=%s" % criterion)
        return criterion
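
    # --- Illustrative sketch (not part of the original source) --------------
    # compute_hrmc_criterion above takes the criterion from the second
    # whitespace-separated column of the last line of a grerrNN.dat file.
    # A hypothetical standalone parser for that format:
    @staticmethod
    def _parse_grerr_criterion(grerr_content):
        """Return the criterion from grerr file content (sketch only)."""
        last_line = grerr_content.strip().split('\n')[-1]
        return float(last_line.split()[1])
    # e.g. _parse_grerr_criterion("0 9.1\n1 4.2\n") == 4.2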

    def curate_dataset(self, run_settings, experiment_id, base_dir, output_url,
                       all_settings):

        iteration = int(getval(run_settings, '%s/system/id' % RMIT_SCHEMA))
        iter_output_dir = os.path.join(base_dir, "output_%s" % iteration)
        output_prefix = '%s://%s@' % (all_settings['scheme'],
                                      all_settings['type'])
        iter_output_dir = "%s%s" % (output_prefix, iter_output_dir)

        (scheme, host, mypath, location,
         query_settings) = storage.parse_bdpurl(output_url)
        fsys = storage.get_filesystem(output_url)

        node_output_dirnames, _ = fsys.listdir(mypath)
        logger.debug("node_output_dirnames=%s" % node_output_dirnames)

        if all_settings['mytardis_host']:
            for i, node_output_dirname in enumerate(node_output_dirnames):
                node_path = os.path.join(iter_output_dir, node_output_dirname)
                # find the criterion recorded for this node, if any
                crit = None
                for ni in self.outputs:
                    if ni.dirname == node_output_dirname:
                        crit = ni.criterion
                        break
                else:
                    # for/else: no matching output entry was found
                    logger.debug("criterion not found")
                    continue
                logger.debug("crit=%s" % crit)


                def extract_psd_func(fp):
                    # parse two-column psd.dat content into x/y series
                    xs = []
                    ys = []
                    for line in fp:
                        columns = line.split()
                        xs.append(float(columns[0]))
                        ys.append(float(columns[1]))
                    return {"hrmcdfile/r1": xs, "hrmcdfile/g1": ys}

                def extract_psdexp_func(fp):
                    # parse two-column PSD_exp.dat content into x/y series
                    xs = []
                    ys = []
                    for line in fp:
                        columns = line.split()
                        xs.append(float(columns[0]))
                        ys.append(float(columns[1]))
                    return {"hrmcdfile/r2": xs, "hrmcdfile/g2": ys}

                def extract_grfinal_func(fp):
                    # parse two-column data_grfinal.dat content
                    xs = []
                    ys = []
                    for line in fp:
                        columns = line.split()
                        xs.append(float(columns[0]))
                        ys.append(float(columns[1]))
                    # FIXME: requires len(xs) == len(ys) to work.
                    # TODO: hack to handle when xs and ys are too
                    # large to fit in Parameter with db_index.
                    # solved by function call at destination
                    # keep roughly 20 evenly spaced points; max(1, ...)
                    # guards against a zero step (and a ZeroDivisionError)
                    # when there are fewer than 20 points
                    step = max(1, len(xs) // 20)
                    cut_xs = xs[::step]
                    cut_ys = ys[::step]
                    return {"hrmcdfile/r3": cut_xs, "hrmcdfile/g3": cut_ys}

                def extract_inputgr_func(fp):
                    # parse two-column input_gr.dat content
                    xs = []
                    ys = []
                    for line in fp:
                        columns = line.split()
                        xs.append(float(columns[0]))
                        ys.append(float(columns[1]))
                    # FIXME: requires len(xs) == len(ys) to work.
                    # TODO: hack to handle when xs and ys are too
                    # large to fit in Parameter with db_index.
                    # solved by function call at destination
                    # keep roughly 20 evenly spaced points, as above
                    step = max(1, len(xs) // 20)
                    cut_xs = xs[::step]
                    cut_ys = ys[::step]
                    return {"hrmcdfile/r4": cut_xs, "hrmcdfile/g4": cut_ys}

                # TODO: hrmcexp graph should be tagged to input directories
                # (not output directories) because we want the result after
                # pruning.
                # TODO: replace self.boto_setttings with mytardis_settings

                EXP_DATASET_NAME_SPLIT = 2

                def get_exp_name_for_output(settings, url, path):
                    # return str(os.sep.join(path.split(os.sep)[:-EXP_DATASET_NAME_SPLIT]))
                    return str(os.sep.join(path.split(os.sep)[-4:-2]))
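
                # Worked example (not in the original source): for a
                # hypothetical path ".../myexp/output_3/node_7/psd.dat",
                # the slice [-4:-2] above yields "myexp/output_3" as the
                # experiment name.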

                def get_dataset_name_for_output(settings, url, path):
                    logger.debug("path=%s" % path)

                    host = settings['host']
                    prefix = 'ssh://%s@%s' % (settings['type'], host)

                    source_url = get_url_with_credentials(
                        settings,
                        os.path.join(prefix, path, "HRMC.inp_values"),
                        is_relative_path=False)
                    logger.debug("source_url=%s" % source_url)
                    try:
                        content = storage.get_file(source_url)
                    except IOError as e:
                        logger.warn("cannot read file %s" % e)
                        return str(
                            os.sep.join(
                                path.split(os.sep)[-EXP_DATASET_NAME_SPLIT:]))

                    logger.debug("content=%s" % content)
                    try:
                        values_map = dict(json.loads(str(content)))
                    except Exception as e:
                        logger.warn("cannot load %s: %s" % (content, e))
                        return str(
                            os.sep.join(
                                path.split(os.sep)[-EXP_DATASET_NAME_SPLIT:]))

                    try:
                        iteration = str(path.split(os.sep)[-2])
                    except Exception as e:
                        logger.error(e)
                        iteration = ""

                    if "_" in iteration:
                        iteration = iteration.split("_")[1]
                    else:
                        iteration = "final"

                    dataset_name = "%s_%s_%s" % (
                        iteration, values_map['generator_counter'],
                        values_map['run_counter'])
                    logger.debug("dataset_name=%s" % dataset_name)
                    return dataset_name
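
                # Worked example (not in the original source): for a path
                # ending in ".../output_3/node_7" with HRMC.inp_values
                # containing {"generator_counter": 5, "run_counter": 2},
                # the logic above yields iteration "3" and dataset_name
                # "3_5_2"; a directory without "_" in its name falls back
                # to the iteration label "final".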

                source_dir_url = get_url_with_credentials(
                    all_settings, node_path, is_relative_path=False)
                logger.debug("source_dir_url=%s" % source_dir_url)
                logger.debug('all_settings=%s' % all_settings)
                experiment_id = mytardis.create_dataset(
                    settings=all_settings,
                    source_url=source_dir_url,
                    exp_id=experiment_id,
                    exp_name=get_exp_name_for_output,
                    dataset_name=get_dataset_name_for_output,
                    dataset_paramset=[
                        mytardis.create_paramset("hrmcdataset/output", []),
                        mytardis.create_graph_paramset(
                            "dsetgraph",
                            name="hrmcdset",
                            graph_info={
                                "axes": ["r (Angstroms)", "PSD"],
                                "legends": ["psd", "PSD_exp"],
                                "type": "line"
                            },
                            value_dict={
                                "hrmcdset/it": self.id,
                                "hrmcdset/crit": crit
                            },
                            value_keys=[["hrmcdfile/r1", "hrmcdfile/g1"],
                                        ["hrmcdfile/r2", "hrmcdfile/g2"]]),
                        mytardis.create_graph_paramset(
                            "dsetgraph",
                            name="hrmcdset2",
                            graph_info={
                                "axes": ["r (Angstroms)", "g(r)"],
                                "legends": ["data_grfinal", "input_gr"],
                                "type": "line"
                            },
                            value_dict={},
                            value_keys=[["hrmcdfile/r3", "hrmcdfile/g3"],
                                        ["hrmcdfile/r4", "hrmcdfile/g4"]]),
                    ],
                    datafile_paramset=[
                        mytardis.create_graph_paramset("dfilegraph",
                                                       name="hrmcdfile",
                                                       graph_info={},
                                                       value_dict={},
                                                       value_keys=[])
                    ],
                    # TODO: move extract function into paramset structure
                    dfile_extract_func={
                        'psd.dat': extract_psd_func,
                        'PSD_exp.dat': extract_psdexp_func,
                        'data_grfinal.dat': extract_grfinal_func,
                        'input_gr.dat': extract_inputgr_func
                    })
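
    # --- Illustrative sketch (not part of the original source) --------------
    # The four extract_*_func closures in curate_dataset differ only in their
    # result keys and in whether they downsample.  A hypothetical factory
    # that collapses them, assuming the same two-column whitespace-separated
    # file format:
    @staticmethod
    def _make_two_column_extractor(x_key, y_key, downsample=False):
        def extract(fp):
            xs = []
            ys = []
            for line in fp:
                columns = line.split()
                xs.append(float(columns[0]))
                ys.append(float(columns[1]))
            if downsample:
                # keep roughly 20 evenly spaced points
                step = max(1, len(xs) // 20)
                xs = xs[::step]
                ys = ys[::step]
            return {x_key: xs, y_key: ys}
        return extract
    # e.g. _make_two_column_extractor("hrmcdfile/r1", "hrmcdfile/g1")
    # reproduces extract_psd_func above.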