Example 1
def _post_datafile(dest_url, content):
    """
    POST to MyTardis to create a new datafile, creating the experiment and
    dataset first if needed.
    """

    (source_scheme, tardis_host_url, source_path, source_location,
        query_settings) = storage.parse_bdpurl(dest_url)

    query_settings['mytardis_host'] = tardis_host_url

    logger.debug("query_settings=%s" % query_settings)

    exp_name = _get_value('exp_name', query_settings)
    dataset_name = _get_value('dataset_name', query_settings)
    root_path = _get_value('root_path', query_settings)
    fname = _get_value('fname', query_settings)
    tardis_user = _get_value('mytardis_username', query_settings)
    tardis_pass = _get_value('mytardis_password', query_settings)
    tardis_port = _get_value('mytardis_port', query_settings)

    exp_id, _ = _get_or_create_experiment(query_settings, exp_name)
    dataset_id, _ = _get_or_create_dataset(query_settings, dataset_name, exp_id)

    url = "https://%s:%s/api/v1/dataset_file/" % (tardis_host_url, tardis_port)
    headers = {'Accept': 'application/json'}
    new_dataset_uri = "/api/v1/dataset/%s/" % dataset_id

    # import tempfile
    # temp = tempfile.NamedTemporaryFile()
    # temp.write(content)
    # temp.flush()
    # temp.seek(0)


    logger.debug("fname=%s" % fname)
    file_path = os.path.join(root_path, fname)
    logger.debug("file_path=%s" % file_path)
    #logger.debug("content=%s" % open(file_path,'rb').read())
    data = json.dumps({
        'dataset': str(new_dataset_uri),

        'filename': os.path.basename(fname),
#        'size': os.stat(temp.name).st_size,
        'size': len(content),
        'mimetype': 'text/plain',
        'md5sum': hashlib.md5(content).hexdigest()
        #'md5sum': hashlib.md5(temp.read()).hexdigest()
        })
    logger.debug("data=%s" % data)

    #temp.seek(0)

    temp = StringIO.StringIO(content)

    r = requests.post(url, data={'json_data': data}, headers=headers,
        files={'attached_file': temp},
        auth=HTTPBasicAuth(tardis_user, tardis_pass),
        verify=False
        )
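A hypothetical call sketch (not from the source) may help when reading the parsing above: the MyTardis coordinates are expected to arrive as query parameters on the destination BDP URL, which storage.parse_bdpurl() splits back out into query_settings. The scheme and every value below are placeholders.

# Hypothetical usage; the scheme, host and all parameter values are assumed.
dest_url = ("mytardis://tardis.example.org/remote/path"
            "?exp_name=demo_exp&dataset_name=demo_dataset"
            "&root_path=/tmp/run1&fname=output/result.dat"
            "&mytardis_username=demo&mytardis_password=secret"
            "&mytardis_port=443")
_post_datafile(dest_url, "contents of the new datafile\n")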
Example 2
def _post_datafile(dest_url, content):
    """
    POST to MyTardis to create a new datafile, creating the experiment and
    dataset first if needed.
    """

    (source_scheme, tardis_host_url, source_path, source_location,
     query_settings) = storage.parse_bdpurl(dest_url)

    query_settings['mytardis_host'] = tardis_host_url

    logger.debug("query_settings=%s" % query_settings)

    exp_name = _get_value('exp_name', query_settings)
    dataset_name = _get_value('dataset_name', query_settings)
    root_path = _get_value('root_path', query_settings)
    fname = _get_value('fname', query_settings)
    tardis_user = _get_value('mytardis_username', query_settings)
    tardis_pass = _get_value('mytardis_password', query_settings)
    tardis_port = _get_value('mytardis_port', query_settings)

    exp_id, _ = _get_or_create_experiment(query_settings, exp_name)
    dataset_id, _ = _get_or_create_dataset(query_settings, dataset_name,
                                           exp_id)

    url = "https://%s:%s/api/v1/dataset_file/" % (tardis_host_url, tardis_port)
    headers = {'Accept': 'application/json'}
    new_dataset_uri = "/api/v1/dataset/%s/" % dataset_id

    # import tempfile
    # temp = tempfile.NamedTemporaryFile()
    # temp.write(content)
    # temp.flush()
    # temp.seek(0)

    logger.debug("fname=%s" % fname)
    file_path = os.path.join(root_path, fname)
    logger.debug("file_path=%s" % file_path)
    #logger.debug("content=%s" % open(file_path,'rb').read())
    data = json.dumps({
        'dataset': str(new_dataset_uri),
        'filename': os.path.basename(fname),
        #        'size': os.stat(temp.name).st_size,
        'size': len(content),
        'mimetype': 'text/plain',
        'md5sum': hashlib.md5(content).hexdigest()
        #'md5sum': hashlib.md5(temp.read()).hexdigest()
    })
    logger.debug("data=%s" % data)

    #temp.seek(0)

    temp = StringIO.StringIO(content)

    r = requests.post(url,
                      data={'json_data': data},
                      headers=headers,
                      files={'attached_file': temp},
                      auth=HTTPBasicAuth(tardis_user, tardis_pass),
                      verify=False)
Example 3
    def process(self, run_settings):
        self.experiment_id = 0
        local_settings = setup_settings(run_settings)
        self.experiment_id = local_settings['experiment_id']
        messages.info(run_settings, "1: waiting for completion")
        logger.debug("settings=%s" % local_settings)

        try:
            self.runs_left = ast.literal_eval(getval(run_settings, '%s/stages/make/runs_left' % RMIT_SCHEMA))
        except (ValueError, SettingNotFoundException):
            self.runs_left = []

        # if self._exists(run_settings,
        #     'http://rmit.edu.au/schemas/stages/make',
        #     u'runs_left'):
        #     self.runs_left = ast.literal_eval(
        #         run_settings['http://rmit.edu.au/schemas/stages/make'][u'runs_left'])
        # else:
        #     self.runs_left = []

        def _get_dest_bdp_url(local_settings):
            return "%s@%s" % (
                    "nci",
                    os.path.join(local_settings['payload_destination'],
                                 str(local_settings['contextid'])))

        dest_url = _get_dest_bdp_url(local_settings)
        computation_platform_url = local_settings['comp_platform_url']
        bdp_username = local_settings['bdp_username']
        comp_pltf_settings = manage.get_platform_settings(
            computation_platform_url,
            bdp_username)
        local_settings.update(comp_pltf_settings)

        encoded_d_url = storage.get_url_with_credentials(
            local_settings,
            dest_url,
            is_relative_path=True,
            ip_address=local_settings['host'])

        (scheme, host, mypath, location, query_settings) = \
            storage.parse_bdpurl(encoded_d_url)

        if self.runs_left:
            job_finished = self._job_finished(
                settings=local_settings,
                remote_path=dest_url)

            if not job_finished:
                return

            self._get_output(local_settings, dest_url)
            self.runs_left -= 1

        if self.runs_left <= 0:
            messages.success(run_settings, "%s: finished" % (1))

        logger.debug("processing finished")
Example 4
    def process(self, run_settings):
        self.experiment_id = 0
        local_settings = setup_settings(run_settings)
        self.experiment_id = local_settings['experiment_id']
        messages.info(run_settings, "1: waiting for completion")
        logger.debug("settings=%s" % local_settings)

        try:
            self.runs_left = ast.literal_eval(
                getval(run_settings, '%s/stages/make/runs_left' % RMIT_SCHEMA))
        except (ValueError, SettingNotFoundException):
            self.runs_left = []

        # if self._exists(run_settings,
        #     'http://rmit.edu.au/schemas/stages/make',
        #     u'runs_left'):
        #     self.runs_left = ast.literal_eval(
        #         run_settings['http://rmit.edu.au/schemas/stages/make'][u'runs_left'])
        # else:
        #     self.runs_left = []

        def _get_dest_bdp_url(local_settings):
            return "%s@%s" % ("nci",
                              os.path.join(
                                  local_settings['payload_destination'],
                                  str(local_settings['contextid'])))

        dest_url = _get_dest_bdp_url(local_settings)
        computation_platform_url = local_settings['comp_platform_url']
        bdp_username = local_settings['bdp_username']
        comp_pltf_settings = manage.get_platform_settings(
            computation_platform_url, bdp_username)
        local_settings.update(comp_pltf_settings)

        encoded_d_url = storage.get_url_with_credentials(
            local_settings,
            dest_url,
            is_relative_path=True,
            ip_address=local_settings['host'])

        (scheme, host, mypath, location, query_settings) = \
            storage.parse_bdpurl(encoded_d_url)

        if self.runs_left:
            job_finished = self._job_finished(settings=local_settings,
                                              remote_path=dest_url)

            if not job_finished:
                return

            self._get_output(local_settings, dest_url)
            self.runs_left -= 1

        if self.runs_left <= 0:
            messages.success(run_settings, "%s: finished" % (1))

        logger.debug("processing finished")
Example 5
    def create_dataset_for_final_output(self, run_settings, experiment_id, base_dir, output_url, all_settings):
        logger.debug("curate_dataset")
        iter_output_dir = os.path.join(os.path.join(base_dir, "output"))
        logger.debug("iter_output_dir=%s" % iter_output_dir)

        output_prefix = '%s://%s@' % (all_settings['scheme'],
                                    all_settings['type'])
        iter_output_dir = "%s%s" % (output_prefix, iter_output_dir)
        logger.debug("iter_output_dir=%s" % iter_output_dir)
        logger.debug("output_url=%s" % output_url)
        (scheme, host, mypath, location, query_settings) = storage.parse_bdpurl(output_url)
        fsys = storage.get_filesystem(output_url)

        node_output_dirnames, _ = fsys.listdir(mypath)
        logger.debug("node_output_dirnames=%s" % node_output_dirnames)

        curate_data = (getval(run_settings, '%s/input/mytardis/curate_data' % self.SCHEMA_PREFIX))
        if curate_data:
            if all_settings['mytardis_host']:
                output_dirs = []
                for m, dir_name in enumerate(node_output_dirnames):
                    output_dirs.append(os.path.join(iter_output_dir, dir_name))

                for m, output_dir in enumerate(output_dirs):
                    #node_path = os.path.join(iter_output_dir, node_dir)
                    logger.debug("output_dir=%s" % output_dir)

                    experiment_paramset = []  # ensure defined even without a metadata builder
                    dataset_paramset = []
                    datafile_paramset = []
                    dfile_extract_func = {}
                    self.load_metadata_builder(run_settings)
                    if self.METADATA_BUILDER:
                        (experiment_paramset, dataset_paramset, datafile_paramset, dfile_extract_func) = \
                            self.METADATA_BUILDER.build_metadata_for_final_output(
                                m, output_dir, run_settings=run_settings,
                                storage_settings=all_settings, output_dirs=output_dirs)

                    source_url = get_url_with_credentials(
                        all_settings, output_dir, is_relative_path=False)
                    logger.debug("source_url=%s" % source_url)

                    experiment_id = mytardis.create_dataset(
                        settings=all_settings,
                        source_url=source_url,
                        exp_name=mytardis.get_exp_name_for_output,
                        dataset_name=mytardis.get_dataset_name_for_output,
                        exp_id=experiment_id,
                        experiment_paramset=experiment_paramset,
                        dataset_paramset=dataset_paramset,
                        datafile_paramset=datafile_paramset,
                        dfile_extract_func=dfile_extract_func)
                    graph_paramset = []
            else:
                logger.warn("no mytardis host specified")
        else:
            logger.warn('Data curation is off')
        return experiment_id
Example 6
    def create_dataset_for_intermediate_output(
            self, run_settings, experiment_id, base_dir, output_url,
            all_settings, outputs=[]):
        logger.debug('self_outputs_curate=%s' % outputs)
        iteration = int(getval(run_settings, '%s/system/id' % self.SCHEMA_PREFIX))
        iter_output_dir = os.path.join(os.path.join(base_dir, "output_%s" % iteration))
        output_prefix = '%s://%s@' % (all_settings['scheme'],
                                    all_settings['type'])
        iter_output_dir = "%s%s" % (output_prefix, iter_output_dir)

        (scheme, host, mypath, location, query_settings) = storage.parse_bdpurl(output_url)
        fsys = storage.get_filesystem(output_url)

        node_output_dirnames, _ = fsys.listdir(mypath)
        logger.debug("node_output_dirnames=%s" % node_output_dirnames)

        if all_settings['mytardis_host']:
            output_dirs = []
            for m, dir_name in enumerate(node_output_dirnames):
                output_dirs.append(os.path.join(iter_output_dir, dir_name))

            for i, output_dir in enumerate(output_dirs):
                dataset_paramset = []
                datafile_paramset = []
                dfile_extract_func = {}
                self.load_metadata_builder(run_settings)
                if self.METADATA_BUILDER:
                    (continue_loop, dataset_paramset, datafile_paramset, dfile_extract_func) = \
                        self.METADATA_BUILDER.build_metadata_for_intermediate_output(
                            output_dir, outputs, run_settings=run_settings,
                            storage_settings=all_settings, output_dirs=output_dirs)
                    if continue_loop:
                        continue

                source_dir_url = get_url_with_credentials(
                    all_settings,
                    output_dir,
                    is_relative_path=False)
                logger.debug("source_dir_url=%s" % source_dir_url)
                logger.debug('all_settings_here=%s' % all_settings)
                system_id = int(getval(run_settings, '%s/system/id' % self.SCHEMA_PREFIX)) #TODO Mytardis

                experiment_id = mytardis.create_dataset(
                    settings=all_settings,
                    source_url=source_dir_url,
                    exp_id=experiment_id,
                    exp_name=mytardis.get_exp_name_for_intermediate_output,
                    dataset_name=mytardis.get_dataset_name_for_output,
                    dataset_paramset=dataset_paramset,
                    datafile_paramset=datafile_paramset,
                    dfile_extract_func=dfile_extract_func
                    )
        else:
            logger.warn("no mytardis host specified")
            return 0
        return experiment_id
Example 7
def retrieve_datafile(url):
    """
    Retrieve contents from a mytardis datafile based on url

    NB: Has this function been tested?

    """

    (source_scheme, tardis_host_url, source_path, source_location,
     query_settings) = storage.parse_bdpurl(url)

    query_settings['mytardis_host'] = tardis_host_url

    logger.debug("query_settings=%s" % query_settings)

    exp_name = _get_value('exp_name', query_settings)
    dataset_name = _get_value('dataset_name', query_settings)
    root_path = _get_value('root_path', query_settings)
    fname = _get_value('fname', query_settings)
    tardis_user = _get_value('mytardis_username', query_settings)
    tardis_pass = _get_value('mytardis_password', query_settings)

    exp_id, _ = _get_or_create_experiment(query_settings, exp_name)
    dataset_id, _ = _get_or_create_dataset(query_settings, dataset_name,
                                           exp_id)

    url = "http://%s/api/v1/dataset_file/%s/" % (tardis_host_url, dataset_id)
    headers = {'Accept': 'application/json'}

    logger.debug("fname=%s" % fname)
    # file_path = os.path.join(root_path, fname)
    # logger.debug("file_path=%s" % file_path)
    #logger.debug("content=%s" % open(file_path,'rb').read())
    # data = json.dumps({
    #     'dataset': str(new_dataset_uri),
    #     'filename': os.path.basename(fname),
    #     'size': os.stat(temp.name).st_size,
    #     'mimetype': 'text/plain',
    #     'md5sum': hashlib.md5(temp.read()).hexdigest()
    #     })
    # logger.debug("data=%s" % data)

    #temp.seek(0)
    r = requests.get(url,
                     headers=headers,
                     auth=HTTPBasicAuth(tardis_user, tardis_pass))
    # FIXME: need to check for status_code and handle failures.

    # logger.debug("r.js=%s" % r.json)
    # logger.debug("r.te=%s" % r.text)
    # logger.debug("r.he=%s" % r.headers)
    return r.text
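A hedged usage sketch along the same lines: the URL is assumed to carry the query parameters read above (this variant hard-codes http and does not read mytardis_port); all values are placeholders.

# Hypothetical usage; only the keys retrieve_datafile() reads are supplied.
url = ("mytardis://tardis.example.org/remote/path"
       "?exp_name=demo_exp&dataset_name=demo_dataset"
       "&root_path=/tmp/run1&fname=output/result.dat"
       "&mytardis_username=demo&mytardis_password=secret")
content = retrieve_datafile(url)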
Example 8
    def _get_output(self, local_settings, source_url):
        """
        Retrieve the output from the task on the node
        """
        logger.debug("get_output from %s" % source_url)

        computation_platform_url = local_settings['comp_platform_url']
        bdp_username = local_settings['bdp_username']
        comp_pltf_settings = manage.get_platform_settings(
            computation_platform_url, bdp_username)
        local_settings.update(comp_pltf_settings)

        encoded_s_url = storage.get_url_with_credentials(
            local_settings,
            source_url,
            is_relative_path=True,
            ip_address=local_settings['host'])

        (scheme, host, mypath, location, query_settings) = \
            storage.parse_bdpurl(encoded_s_url)
        make_path = os.path.join(query_settings['root_path'], mypath)
        logger.debug("make_path=%s" % make_path)

        output_storage_url = local_settings['storeout_platform_url']
        logger.debug("output_storage_url=%s" % output_storage_url)
        output_storage_settings = manage.get_platform_settings(
            output_storage_url, bdp_username)
        local_settings.update(output_storage_settings)
        logger.debug("output_storage_settings=%s" % output_storage_settings)

        dest_url = '%s://%s@%s/%s/make%s' % (
            output_storage_settings['scheme'], output_storage_settings['type'],
            output_storage_settings['host'],
            local_settings['storeout_platform_offset'],
            str(local_settings['contextid']))
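        # Illustrative only (values assumed): with scheme 'ssh', type 'unix',
        # host '118.138.241.232', offset 'myoutput' and contextid 277, the
        # format string above yields ssh://unix@118.138.241.232/myoutput/make277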

        logger.debug("Transferring output from %s to %s" %
                     (source_url, dest_url))
        local_settings.update(output_storage_settings)
        encoded_d_url = storage.get_url_with_credentials(
            local_settings, dest_url)
        logger.debug("encoded_d_url=%s" % encoded_d_url)
        # FIXME: might want to turn on paramiko compress function
        #storage_files(encoded_d_url, exceptions=[])
        # to speed up this transfer
        try:
            storage.copy_directories(encoded_s_url, encoded_d_url)
        except SSHException, e:
            logger.error(e)
            # FIXME: Could just exit, but need to flag that this data has not
            # been transferred.
            raise
Example 9
def retrieve_datafile(url):
    """
    Retrieve contents from a mytardis datafile based on url

    NB: Has this function been tested?

    """

    (source_scheme, tardis_host_url, source_path, source_location,
        query_settings) = storage.parse_bdpurl(url)

    query_settings['mytardis_host'] = tardis_host_url

    logger.debug("query_settings=%s" % query_settings)

    exp_name = _get_value('exp_name', query_settings)
    dataset_name = _get_value('dataset_name', query_settings)
    root_path = _get_value('root_path', query_settings)
    fname = _get_value('fname', query_settings)
    tardis_user = _get_value('mytardis_username', query_settings)
    tardis_pass = _get_value('mytardis_password', query_settings)

    exp_id, _ = _get_or_create_experiment(query_settings, exp_name)
    dataset_id, _ = _get_or_create_dataset(query_settings, dataset_name, exp_id)

    url = "http://%s/api/v1/dataset_file/%s/" % (tardis_host_url, dataset_id)
    headers = {'Accept': 'application/json'}

    logger.debug("fname=%s" % fname)
    # file_path = os.path.join(root_path, fname)
    # logger.debug("file_path=%s" % file_path)
    #logger.debug("content=%s" % open(file_path,'rb').read())
    # data = json.dumps({
    #     'dataset': str(new_dataset_uri),
    #     'filename': os.path.basename(fname),
    #     'size': os.stat(temp.name).st_size,
    #     'mimetype': 'text/plain',
    #     'md5sum': hashlib.md5(temp.read()).hexdigest()
    #     })
    # logger.debug("data=%s" % data)

    #temp.seek(0)
    r = requests.get(url, headers=headers,
        auth=HTTPBasicAuth(tardis_user, tardis_pass)
        )
    # FIXME: need to check for status_code and handle failures.

    # logger.debug("r.js=%s" % r.json)
    # logger.debug("r.te=%s" % r.text)
    # logger.debug("r.he=%s" % r.headers)
    return r.text
Example 10
    def _get_output(self, local_settings, source_url):
        """
        Retrieve the output from the task on the node
        """
        logger.debug("get_output from %s" % source_url)

        computation_platform_url = local_settings['comp_platform_url']
        bdp_username = local_settings['bdp_username']
        comp_pltf_settings = manage.get_platform_settings(
            computation_platform_url,
            bdp_username)
        local_settings.update(comp_pltf_settings)

        encoded_s_url = storage.get_url_with_credentials(
            local_settings,
            source_url,
            is_relative_path=True,
            ip_address=local_settings['host'])

        (scheme, host, mypath, location, query_settings) = \
            storage.parse_bdpurl(encoded_s_url)
        make_path = os.path.join(query_settings['root_path'], mypath)
        logger.debug("make_path=%s" % make_path)

        output_storage_url = local_settings['storeout_platform_url']
        logger.debug("output_storage_url=%s" % output_storage_url)
        output_storage_settings = manage.get_platform_settings(output_storage_url, bdp_username)
        local_settings.update(output_storage_settings)
        logger.debug("output_storage_settings=%s" % output_storage_settings)

        dest_url = '%s://%s@%s/%s/make%s' % (
            output_storage_settings['scheme'], output_storage_settings['type'],
            output_storage_settings['host'],
            local_settings['storeout_platform_offset'],
            str(local_settings['contextid']))

        logger.debug("Transferring output from %s to %s" %
                     (source_url, dest_url))
        local_settings.update(output_storage_settings)
        encoded_d_url = storage.get_url_with_credentials(local_settings, dest_url)
        logger.debug("encoded_d_url=%s" % encoded_d_url)
        # FIXME: might want to turn on paramiko compress function
        #storage_files(encoded_d_url, exceptions=[])
        # to speed up this transfer
        try:
            storage.copy_directories(encoded_s_url, encoded_d_url)
        except SSHException, e:
            logger.error(e)
            # FIXME: Could just exit, but need to flag that this data has not
            # been transferred.
            raise
Example 11
    def curate_dataset(self, run_settings, experiment_id, base_dir, output_url,
                       all_settings):

        (scheme, host, mypath, location, query_settings) = storage.parse_bdpurl(output_url)

        logger.debug("output_url=%s" % output_url)
        output_settings = self.get_platform_settings(run_settings, RMIT_SCHEMA + '/platform/storage/output')
        current_output_url = "%s://%s@%s/%s" % (scheme, output_settings['type'],
                                                host, os.path.join(mypath, '1/'))
        logger.debug('current-dest=%s' % current_output_url)
        outcar_url = storage.get_url_with_credentials(
            output_settings, current_output_url + self.OUTCAR_FILE,
            is_relative_path=False)
        logger.debug("outcar_url=%s" % outcar_url)

        try:
            outcar_content = storage.get_file(outcar_url)
        except IOError, e:
            logger.error(e)
            toten = None
Example 12
    def _job_finished(self, settings, remote_path):

        encoded_d_url = storage.get_url_with_credentials(
            settings=settings,
            url_or_relative_path=remote_path,
            is_relative_path=True,
            ip_address=settings['host'])

        (scheme, host, mypath, location, query_settings) = \
            storage.parse_bdpurl(encoded_d_url)
        stdout = ''
        stderr = ''

        try:
            ssh = open_connection(ip_address=host, settings=settings)
            (stdout, stderr) = compute.run_make(
                ssh, (os.path.join(query_settings['root_path'], mypath)),
                'running')
        except Exception, e:
            logger.error(e)
            raise
Example 13
    def _job_finished(self, settings, remote_path):

        encoded_d_url = storage.get_url_with_credentials(
            settings=settings,
            url_or_relative_path=remote_path,
            is_relative_path=True,
            ip_address=settings['host'])

        (scheme, host, mypath, location, query_settings) = \
            storage.parse_bdpurl(encoded_d_url)
        stdout = ''
        stderr = ''

        try:
            ssh = open_connection(ip_address=host, settings=settings)
            (stdout, stderr) = compute.run_make(
                ssh, os.path.join(query_settings['root_path'], mypath),
                'running')
        except Exception, e:
            logger.error(e)
            raise
Example 14
    def process(self, run_settings):
        settings = setup_settings(run_settings)
        messages.info(run_settings, "1: execute starting")

        def _get_dest_bdp_url(settings):
            return "%s@%s" % (
                    "nci",
                    os.path.join(settings['payload_destination'],
                                 str(settings['contextid'])))

        dest_url = _get_dest_bdp_url(settings)
        computation_platform_url = settings['comp_platform_url']
        bdp_username = settings['bdp_username']
        comp_pltf_settings = manage.get_platform_settings(
            computation_platform_url,
            bdp_username)
        logger.debug("comp_pltf_settings=%s" % pformat(comp_pltf_settings))
        settings.update(comp_pltf_settings)
        encoded_d_url = storage.get_url_with_credentials(
            settings,
            dest_url,
            is_relative_path=True,
            ip_address=settings['host'])
        (scheme, host, mypath, location, query_settings) = \
            storage.parse_bdpurl(encoded_d_url)
        stderr = ''
        try:
            ssh = open_connection(
                ip_address=settings['host'],
                settings=settings)
            (command_out, stderr) = compute.run_make(
                ssh, os.path.join(query_settings['root_path'], mypath),
                'startrun')
        except Exception, e:
            logger.error(e)
            raise
Example 15
    def curate_dataset(self, run_settings, experiment_id, base_dir, output_url,
                       all_settings):

        iteration = int(getval(run_settings, '%s/system/id' % RMIT_SCHEMA))
        iter_output_dir = os.path.join(
            os.path.join(base_dir, "output_%s" % iteration))
        output_prefix = '%s://%s@' % (all_settings['scheme'],
                                      all_settings['type'])
        iter_output_dir = "%s%s" % (output_prefix, iter_output_dir)

        (scheme, host, mypath, location,
         query_settings) = storage.parse_bdpurl(output_url)
        fsys = storage.get_filesystem(output_url)

        node_output_dirnames, _ = fsys.listdir(mypath)
        logger.debug("node_output_dirnames=%s" % node_output_dirnames)

        if all_settings['mytardis_host']:
            for i, node_output_dirname in enumerate(node_output_dirnames):
                node_path = os.path.join(iter_output_dir, node_output_dirname)
                # find criterion
                crit = None  # is there an infinity criterion
                for ni in self.outputs:
                    if ni.dirname == node_output_dirname:
                        crit = ni.criterion
                        break
                else:
                    logger.debug("criterion not found")
                    continue
                logger.debug("crit=%s" % crit)

                # graph_params = []
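                # Each extract_*_func below turns a two-column text file into
                # a dict of x/y lists keyed by graph parameter names; e.g. a
                # file containing "0.1 1.5\n0.2 1.7\n" yields
                # {"hrmcdfile/r1": [0.1, 0.2], "hrmcdfile/g1": [1.5, 1.7]}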

                def extract_psd_func(fp):
                    res = []
                    xs = []
                    ys = []
                    for i, line in enumerate(fp):
                        columns = line.split()
                        xs.append(float(columns[0]))
                        ys.append(float(columns[1]))
                    res = {"hrmcdfile/r1": xs, "hrmcdfile/g1": ys}
                    return res

                def extract_psdexp_func(fp):
                    res = []
                    xs = []
                    ys = []
                    for i, line in enumerate(fp):
                        columns = line.split()
                        xs.append(float(columns[0]))
                        ys.append(float(columns[1]))
                    res = {"hrmcdfile/r2": xs, "hrmcdfile/g2": ys}
                    return res

                def extract_grfinal_func(fp):
                    res = []
                    xs = []
                    ys = []
                    for i, line in enumerate(fp):
                        columns = line.split()
                        xs.append(float(columns[0]))
                        ys.append(float(columns[1]))
                    #FIXME: len(xs) == len(ys) for this to work.
                    #TODO: hack to handle when xs and ys are too
                    # large to fit in Parameter with db_index.
                    # solved by function call at destination
                    cut_xs = [
                        xs[i] for i, x in enumerate(xs)
                        if (i % (len(xs) / 20) == 0)
                    ]
                    cut_ys = [
                        ys[i] for i, x in enumerate(ys)
                        if (i % (len(ys) / 20) == 0)
                    ]

                    res = {"hrmcdfile/r3": cut_xs, "hrmcdfile/g3": cut_ys}
                    return res

                def extract_inputgr_func(fp):
                    res = []
                    xs = []
                    ys = []
                    for i, line in enumerate(fp):
                        columns = line.split()
                        xs.append(float(columns[0]))
                        ys.append(float(columns[1]))
                    #FIXME: len(xs) == len(ys) for this to work.
                    #TODO: hack to handle when xs and ys are too
                    # large to fit in Parameter with db_index.
                    # solved by function call at destination
                    cut_xs = [
                        xs[i] for i, x in enumerate(xs)
                        if (i % (len(xs) / 20) == 0)
                    ]
                    cut_ys = [
                        ys[i] for i, x in enumerate(ys)
                        if (i % (len(ys) / 20) == 0)
                    ]

                    res = {"hrmcdfile/r4": cut_xs, "hrmcdfile/g4": cut_ys}
                    return res

                #TODO: hrmcexp graph should be tagged to input directories (not output directories)
                #because we want the result after pruning.
                #todo: replace self.boto_settings with mytardis_settings

                EXP_DATASET_NAME_SPLIT = 2

                def get_exp_name_for_output(settings, url, path):
                    # return str(os.sep.join(path.split(os.sep)[:-EXP_DATASET_NAME_SPLIT]))
                    return str(os.sep.join(path.split(os.sep)[-4:-2]))

                def get_dataset_name_for_output(settings, url, path):
                    logger.debug("path=%s" % path)

                    host = settings['host']
                    prefix = 'ssh://%s@%s' % (settings['type'], host)

                    source_url = get_url_with_credentials(
                        settings,
                        os.path.join(prefix, path, "HRMC.inp_values"),
                        is_relative_path=False)
                    logger.debug("source_url=%s" % source_url)
                    try:
                        content = storage.get_file(source_url)
                    except IOError, e:
                        logger.warn("cannot read file %s" % e)
                        return str(
                            os.sep.join(
                                path.split(os.sep)[-EXP_DATASET_NAME_SPLIT:]))

                    logger.debug("content=%s" % content)
                    try:
                        values_map = dict(json.loads(str(content)))
                    except Exception, e:
                        logger.warn("cannot load %s: %s" % (content, e))
                        return str(
                            os.sep.join(
                                path.split(os.sep)[-EXP_DATASET_NAME_SPLIT:]))

                    try:
                        iteration = str(path.split(os.sep)[-2:-1][0])
                    except Exception, e:
                        logger.error(e)
                        iteration = ""

                    if "_" in iteration:
                        iteration = iteration.split("_")[1]
                    else:
                        iteration = "final"

                    dataset_name = "%s_%s_%s" % (
                        iteration, values_map['generator_counter'],
                        values_map['run_counter'])
                    logger.debug("dataset_name=%s" % dataset_name)
                    return dataset_name
Example 16
    def curate_dataset(self, run_settings, experiment_id, base_url, output_url,
                       all_settings):
        '''
            Curates dataset
        '''
        # Retrieves process directories below the current output location
        iteration = int(getval(run_settings, '%s/system/id' % SCHEMA_PREFIX))
        output_prefix = '%s://%s@' % (all_settings['scheme'],
                                      all_settings['type'])
        current_output_url = "%s%s" % (
            output_prefix,
            os.path.join(os.path.join(base_url, "output_%s" % iteration)))
        (scheme, host, current_output_path, location,
         query_settings) = storage.parse_bdpurl(output_url)
        output_fsys = storage.get_filesystem(output_url)
        process_output_dirs, _ = output_fsys.listdir(current_output_path)

        # Curates a dataset with metadata per process
        for i, process_output_dir in enumerate(process_output_dirs):
            # Expand the process output directory and add credentials for access
            process_output_url = '/'.join(
                [current_output_url, process_output_dir])
            process_output_url_with_cred = get_url_with_credentials(
                all_settings, process_output_url, is_relative_path=False)
            # Expand the process output file and add credentials for access
            output_file_url_with_cred = storage.get_url_with_credentials(
                all_settings,
                '/'.join([process_output_url, OUTPUT_FILE]),
                is_relative_path=False)
            try:
                output_content = storage.get_file(output_file_url_with_cred)
                val1, val2 = output_content.split()
            except (ValueError, IndexError, IOError) as e:  # ValueError: malformed two-value line
                logger.warn(e)
                continue
            try:
                x = float(val1)
                y = float(val2)
            except (ValueError, IndexError) as e:
                logger.warn(e)
                continue

            # Returns the process id as MyTardis dataset name
            all_settings['graph_point_id'] = str(i)

            def _get_dataset_name(settings, url, path):
                return all_settings['graph_point_id']

            # Creates new dataset and adds to experiment
            # If experiment_id==0, creates new experiment
            experiment_id = mytardis.create_dataset(
                settings=all_settings,  # MyTardis credentials
                source_url=process_output_url_with_cred,
                exp_id=experiment_id,
                dataset_name=_get_dataset_name,  # the function that defines dataset name
                dataset_paramset=[
                    # a new blank parameter set conforming to schema 'remotemake/output'
                    mytardis.create_paramset("remotemake/output", []),
                    mytardis.create_graph_paramset(
                        "dsetgraph",  # name of schema
                        name="randdset",  # a unique dataset name
                        graph_info={},
                        value_dict={
                            "randdset/x": x,
                            "randdset/y": y
                        },  # values to be used in experiment graphs
                        value_keys=[]),
                ])
        return experiment_id
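For orientation, a minimal sketch of the parsing step above under an assumed OUTPUT_FILE layout (a single line of two whitespace-separated numbers); the values are placeholders, not from the source.

# Hypothetical OUTPUT_FILE content and the resulting graph point:
output_content = "4.2 7.9\n"        # assumed two-column file format
val1, val2 = output_content.split()
x, y = float(val1), float(val2)     # -> 4.2, 7.9, stored as randdset/x, randdset/y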
Example 17
    def process_outputs(self, run_settings, base_dir, output_url, all_settings,
                        offset):

        # output_dir = 118.138.241.232/outptuersdfsd/sweep277/hrmc278/output_1
        # output_prefix = ssh://unix@
        # node_output_dir = 2

        output_prefix = '%s://%s@' % (all_settings['scheme'],
                                      all_settings['type'])

        id = int(getval(run_settings, '%s/system/id' % RMIT_SCHEMA))
        iter_output_dir = os.path.join(os.path.join(base_dir,
                                                    "output_%s" % id))
        logger.debug('iter_output_dir=%s' % iter_output_dir)
        output_prefix = '%s://%s@' % (all_settings['scheme'],
                                      all_settings['type'])
        logger.debug('output_prefix=%s' % output_prefix)
        #iter_output_dir = "%s%s" % (output_prefix, iter_output_dir)
        logger.debug('output_url=%s' % output_url)
        (scheme, host, iter_output_path, location,
         query_settings) = storage.parse_bdpurl(output_url)
        logger.debug("iter_output_path=%s" % iter_output_path)
        iter_out_fsys = storage.get_filesystem(output_url)
        logger.debug("iter_out_fsys=%s" % iter_out_fsys)
        node_output_dirnames, _ = iter_out_fsys.listdir(iter_output_path)
        logger.debug('node_output_dirnames=%s' % node_output_dirnames)
        self.audit = ""

        Node_info = namedtuple('Node_info', ['dirname', 'number', 'criterion'])

        BASE_FNAME = "HRMC.inp"

        # generate criteria
        self.outputs = []
        for node_output_dirname in node_output_dirnames:
            node_path = output_prefix + os.path.join(iter_output_dir,
                                                     node_output_dirname)
            criterion = self.compute_psd_criterion(all_settings, node_path)
            #criterion = self.compute_hrmc_criterion(values_map['run_counter'], node_output_dirname, fs,)
            logger.debug("criterion=%s" % criterion)

            try:
                values_url = get_url_with_credentials(
                    all_settings,
                    os.path.join(node_path, '%s_values' % BASE_FNAME),
                    is_relative_path=False)

                values_content = storage.get_file(values_url)

                logger.debug("values_file=%s" % values_url)
            except IOError:
                logger.warn("no values file found")
                values_map = {}
            else:
                values_map = dict(json.loads(values_content))

            self.outputs.append(
                Node_info(dirname=node_output_dirname,
                          number=values_map['run_counter'],
                          criterion=criterion))

        if not self.outputs:
            logger.error("no output found for this iteration")
            return

        self.outputs.sort(key=lambda x: int(x.criterion))
        logger.debug("self.outputs=%s" % self.outputs)

        try:
            # FIXME: need to validate this output to make sure list of int
            threshold = ast.literal_eval(
                getval(run_settings, '%s/input/hrmc/threshold' % RMIT_SCHEMA))
        except (SettingNotFoundException, ValueError):
            logger.warn("no threshold found when expected")
            return False
        logger.debug("threshold = %s" % threshold)
        total_picks = 1
        if len(threshold) > 1:
            for i in threshold:
                total_picks *= i  # product of per-dimension thresholds
        else:
            total_picks = threshold[0]

        def copy_files_with_pattern(iter_out_fsys, source_path, dest_path,
                                    pattern, all_settings):
            """
            Copy files matching pattern from source_path to dest_path on the
            iteration output filesystem.
            """
            output_prefix = '%s://%s@' % (all_settings['scheme'],
                                          all_settings['type'])

            logger.debug('source_path=%s, dest_path=%s' %
                         (source_path, dest_path))
            # (scheme, host, iter_output_path, location, query_settings) = storage.parse_bdpurl(source_path)
            _, node_output_fnames = iter_out_fsys.listdir(source_path)
            ip_address = all_settings['ip_address']
            for f in node_output_fnames:
                if fnmatch.fnmatch(f, pattern):
                    source_url = get_url_with_credentials(
                        all_settings,
                        output_prefix +
                        os.path.join(ip_address, source_path, f),
                        is_relative_path=False)
                    dest_url = get_url_with_credentials(
                        all_settings,
                        output_prefix + os.path.join(ip_address, dest_path, f),
                        is_relative_path=False)
                    logger.debug('source_url=%s, dest_url=%s' %
                                 (source_url, dest_url))
                    content = storage.get_file(source_url)
                    storage.put_file(dest_url, content)

        # Make new input dirs
        new_input_dir = os.path.join(
            os.path.join(base_dir, "input_%d" % (id + 1)))
        for index in range(0, total_picks):
            Node_info = self.outputs[index]
            logger.debug("node_info.dirname=%s" % Node_info.dirname)
            logger.debug("Node_info=%s" % str(Node_info))

            new_input_path = os.path.join(new_input_dir, Node_info.dirname)
            logger.debug("New input node dir %s" % new_input_path)

            old_output_path = os.path.join(iter_output_dir, Node_info.dirname)

            # Move all existing domain input files unchanged to next input directory
            for f in DOMAIN_INPUT_FILES:
                source_url = get_url_with_credentials(
                    all_settings,
                    output_prefix + os.path.join(old_output_path, f),
                    is_relative_path=False)
                dest_url = get_url_with_credentials(
                    all_settings,
                    output_prefix + os.path.join(new_input_path, f),
                    is_relative_path=False)
                logger.debug('source_url=%s, dest_url=%s' %
                             (source_url, dest_url))

                content = storage.get_file(source_url)
                logger.debug('content collected')
                storage.put_file(dest_url, content)
                logger.debug('put successfully')

            logger.debug('put file successfully')
            pattern = "*_values"
            output_offset = os.path.join(
                os.path.join(offset, "output_%s" % id, Node_info.dirname))
            input_offset = os.path.join(
                os.path.join(offset, "input_%s" % (id + 1), Node_info.dirname))
            copy_files_with_pattern(iter_out_fsys, output_offset, input_offset,
                                    pattern, all_settings)

            pattern = "*_template"
            copy_files_with_pattern(iter_out_fsys, output_offset, input_offset,
                                    pattern, all_settings)

            # NB: Converge stage triggers based on criterion value from audit.
            logger.debug('starting audit')
            info = "Run %s preserved (error %s)\n" % (Node_info.number,
                                                      Node_info.criterion)
            audit_url = get_url_with_credentials(
                all_settings,
                output_prefix + os.path.join(new_input_path, 'audit.txt'),
                is_relative_path=False)
            storage.put_file(audit_url, info)
            logger.debug("audit=%s" % info)
            logger.debug('1:audit_url=%s' % audit_url)
            self.audit += info

            # move xyz_final.xyz to initial.xyz
            source_url = get_url_with_credentials(
                all_settings,
                output_prefix + os.path.join(old_output_path, "xyz_final.xyz"),
                is_relative_path=False)
            logger.debug('source_url=%s' % source_url)
            dest_url = get_url_with_credentials(
                all_settings,
                output_prefix +
                os.path.join(new_input_path, 'input_initial.xyz'),
                is_relative_path=False)
            logger.debug('dest_url=%s' % dest_url)
            content = storage.get_file(source_url)
            logger.debug('content=%s' % content)
            storage.put_file(dest_url, content)
            self.audit += "spawning diamond runs\n"

        logger.debug(
            "input_dir=%s" %
            (output_prefix + os.path.join(new_input_dir, 'audit.txt')))
        audit_url = get_url_with_credentials(
            all_settings,
            output_prefix + os.path.join(new_input_dir, 'audit.txt'),
            is_relative_path=False)
        logger.debug('audit_url=%s' % audit_url)
        storage.put_file(audit_url, self.audit)
Example 18
def create_dataset(settings,
                   source_url,
                   exp_id,
                   exp_name=_get_exp_name,
                   dataset_name=_get_dataset_name,
                   experiment_paramset=[],
                   dataset_paramset=[],
                   datafile_paramset=[],
                   dfile_extract_func=None):
    """

    Notes:
        POST to mytardis_host REST API with mytardis_user and mytardis_password
        credentials to create or update experiment for a new dataset containing
        datafiles from source_url BDP directory.

    Args:
        settings:

        source_url: url containing data to be ingested
        exp_id:
        [exp_name,dataset_name]:  functions that return new
    experiment and dataset names respectively based on url and path
        experiment_paramset: ...
        dataset_paramset: ...
        datafile_paramset:
        dfile_extract_func:


    FIXME,TODO: What if tardis is unavailable?  Connection to mytardis is probably
    better handled as a separate celery subtask, which can retry until working and
    be async.

    FIXME: missing all error checking and retrying of connection to mytardis.
    Reliability framework should be able to supply this?
    """

    #TODO: method should take BDP url source_url not, expanded one.

    logger.debug("post_dataset")
    tardis_user = settings["mytardis_user"]
    tardis_pass = settings["mytardis_password"]
    tardis_host_url = "http://%s" % settings["mytardis_host"]
    logger.debug("posting dataset from %s to mytardis at %s as user %s" %
                 (source_url, tardis_host_url, tardis_user))

    (source_scheme, source_location, source_path, source_location,
     query_settings) = storage.parse_bdpurl(source_url)

    logger.debug("source_path=%s" % source_path)

    if source_scheme == "file":
        root_path = _get_value('root_path', query_settings)
    else:
        logger.debug('schema=%s' % source_scheme)
        #raise InvalidInputError("only file source_schema supported for source of mytardis transfer")

    expname = exp_name(settings, source_url, source_path)
    new_exp_id = create_experiment(settings, exp_id, expname,
                                   experiment_paramset)

    new_experiment_uri = "/api/v1/experiment/%s/" % new_exp_id

    # TODO: check that we do not already have a dataset with
    # the same name and overwrite or don't move.
    # save dataset
    logger.debug("saving dataset in experiment at %s" % new_exp_id)
    url = "%s/api/v1/dataset/?format=json" % tardis_host_url
    headers = {'content-type': 'application/json'}

    # # FIXME: schema should be a parameter
    # schemas = [{
    #            "schema": "http://rmit.edu.au/schemas/hrmcdataset",
    #            "parameters": []
    #           }]
    # if dataset_schema:
    #    schemas.append({
    #        "schema": dataset_schema,
    #        "parameters": []
    #        })

    schemas = dataset_paramset

    logger.debug("schemas=%s" % schemas)
    data = json.dumps({
        'experiments': [new_experiment_uri],
        'description': dataset_name(settings, source_url, source_path),
        'parameter_sets': schemas
    })
    logger.debug("data=%s" % data)
    logger.debug("post to %s" % url)
    r = requests.post(url,
                      data=data,
                      headers=headers,
                      auth=HTTPBasicAuth(tardis_user, tardis_pass))
    # FIXME: need to check for status_code and handle failures.

    logger.debug("r.json=%s" % r.json)
    logger.debug("r.text=%s" % r.text)
    logger.debug("r.headers=%s" % r.headers)
    header_location = r.headers['location']
    new_dataset_uri = header_location[len(tardis_host_url):]

    # move files across
    source_files = storage.list_all_files(source_url)
    logger.debug("source_files=%s" % source_files)
    url = "%s/api/v1/dataset_file/" % tardis_host_url
    headers = {'Accept': 'application/json'}

    args = source_url.split('?')[1]

    logger.debug('args=%s' % args)
    '''
    psd_url = smartconnectorscheduler.get_url_with_credentials(output_storage_credentials,
                        'ssh://unix@' + os.path.join(self.output_dir,
                            node_output_dir, "PSD_output", "psd.dat"), is_relative_path=False)
        logger.debug('psd_url=%s' % psd_url)

        psd = hrmcstages.storage.get_filep(psd_url)
    '''
    for file_location in source_files:
        logger.debug('file_location=%s' %
                     os.path.join(source_location, file_location))
        source_file_url = "%s://%s?%s" % (
            source_scheme, os.path.join(source_location, file_location), args)
        logger.debug('source_file_url=%s' % source_file_url)
        source_file, source_file_ref = storage.get_filep(source_file_url,
                                                         sftp_reference=True)
        logger.debug('source_file=%s' % source_file._name)
        #file_path = os.path.join(root_path, file_location)
        #file_path = os.path.join(source_url, file_location)
        #logger.debug("file_path=%s" % file_path)
        #logger.debug("content=%s" % open(file_path,'rb').read())

        new_datafile_paramset = []
        logger.debug("datafile_paramset=%s" % datafile_paramset)
        for paramset in datafile_paramset:
            new_paramset = {}
            logger.debug("paramset=%s" % paramset)
            new_paramset['schema'] = paramset['schema']

            has_value = False
            has_keys = False
            new_param_vals = []
            for param in paramset['parameters']:
                new_param = {}
                for param_key, v in param.items():

                    if param_key == 'name' and v == "value_dict":
                        new_param['name'] = 'value_dict'
                        new_value = {}

                        #val = param['string_value']

                        # if not isinstance(val, basestring):
                        #     dfile_extract_func = val

                        found_func_match = False
                        for fname, func in dfile_extract_func.items():
                            logger.debug("fname=%s,func=%s" % (fname, func))
                            if fname == os.path.basename(file_location):
                                #new_value.update(func(open(file_path, 'r')))
                                source_file.seek(0)
                                new_value.update(func(source_file))

                                found_func_match = True  # FIXME: can multiple funcs match?

                        logger.debug("new_value=%s" % new_value)

                        if found_func_match:
                            new_param['string_value'] = json.dumps(new_value)
                        else:
                            new_param['string_value'] = param['string_value']
                        break
                    else:
                        # in case string_value is processed first
                        new_param[param_key] = v

                if new_param['name'] == "value_dict" and len(
                        json.loads(new_param['string_value'])):
                    has_value = True
                if new_param['name'] == "value_keys" and len(
                        json.loads(new_param['string_value'])):
                    has_keys = True
                new_param_vals.append(new_param)

            new_paramset['parameters'] = new_param_vals

            logger.debug("has_value=%s" % has_value)
            logger.debug("has_keys=%s" % has_keys)

            if has_value or has_keys:
                new_datafile_paramset.append(new_paramset)
            else:
                logger.debug("not adding %s" % new_paramset)

        logger.debug("new_datafile_paramset=%s" % new_datafile_paramset)
        logger.debug("file_name=%s" % source_file._name)
        file_size = source_file_ref.size(source_file._name)
        logger.debug("file_size=%s" % file_size)
        if file_size > 0:
            source_file.seek(0)
            data = json.dumps({
                'dataset': str(new_dataset_uri),
                'parameter_sets': new_datafile_paramset,
                'filename': os.path.basename(file_location),
                #'filename': os.path.basename(file_path),
                'size': file_size,
                'mimetype': 'text/plain',
                'md5sum': hashlib.md5(source_file.read()).hexdigest()
                #'md5sum': hashlib.md5(open(file_path, 'r').read()).hexdigest()
            })
            logger.debug("data=%s" % data)
            #import pdb; pdb.set_trace()
            source_file.seek(0)
            #logger.debug(source_file.read())
            source_file.seek(0)
            r = requests.post(
                url,
                data={'json_data': data},
                headers=headers,
                files={'attached_file': source_file},  # was: open(file_path, 'rb')
                auth=HTTPBasicAuth(tardis_user, tardis_pass))

            # FIXME: need to check for status_code and handle failures.

            logger.debug("r.js=%s" % r.json)
            logger.debug("r.te=%s" % r.text)
            logger.debug("r.he=%s" % r.headers)
        else:
            logger.warn("not transferring empty file %s" % file_location)
            #TODO: check whether mytardis api can accept zero length files

    return new_exp_id
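A hedged call sketch for create_dataset(), using only the settings keys the function reads above; the host, credentials and source BDP URL are placeholders, and exp_id=0 relies on the convention noted in Example 16 that a zero experiment id creates a new experiment.

# Hypothetical usage; all values are placeholders.
settings = {
    "mytardis_user": "demo",
    "mytardis_password": "secret",
    "mytardis_host": "tardis.example.org",
}
source_url = "file://127.0.0.1/local/output?root_path=/tmp/run1"
new_exp_id = create_dataset(settings=settings,
                            source_url=source_url,
                            exp_id=0,
                            dataset_paramset=[])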
Example 19
    def build_metadata_for_final_output(self, m, output_dir, **kwargs):
        #FIXME: this calculation should be done as in extract_psd_func
        # pulling directly from data_errors rather than passing in
        # through nested function.
        experiment_paramset = []
        dataset_paramset = []
        datafile_paramset = []
        dfile_extract_func = {}

        exp_value_keys = []
        legends = []
        for m, current_dir in enumerate(kwargs['output_dirs']):
            #node_path = os.path.join(iter_output_dir, node_dir)

            exp_value_keys.append(["hrmcdset%s/step" % m, "hrmcdset%s/err" % m])

            source_url = storage.get_url_with_credentials(
                kwargs['storage_settings'], current_dir, is_relative_path=False)

            (source_scheme, source_location, source_path, source_location,
                query_settings) = storage.parse_bdpurl(source_url)
            logger.debug("source_url=%s" % source_url)
            legends.append(
                mytardis.get_dataset_name_for_output(
                    kwargs['storage_settings'], "", source_path))

        logger.debug("exp_value_keys=%s" % exp_value_keys)
        logger.debug("legends=%s" % legends)

        # for m, output_dir in enumerate(kwargs['output_dirs']):
        #node_path = os.path.join(iter_output_dir, output_dir)
        node_path = output_dir
        logger.debug("node_path=%s" % node_path)

        dataerrors_url = storage.get_url_with_credentials(
            kwargs['storage_settings'],
            os.path.join(node_path, self.DATA_ERRORS_FILE),
            is_relative_path=False)
        logger.debug("dataerrors_url=%s" % dataerrors_url)
        dataerrors_content = storage.get_file(dataerrors_url)
        xs = []
        ys = []
        re_dbl_fort = re.compile(r'(\d*\.\d+)[dD]([-+]?\d+)')
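        # e.g. a Fortran-style value such as "0.12345D+02" becomes
        # "0.12345E+02" so that float() below can parse it.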
        for i, line in enumerate(dataerrors_content.splitlines()):
            if i == 0:
                continue
            columns = line.split()
            try:
                hrmc_step = int(columns[self.STEP_COLUMN_NUM])
            except ValueError:
                logger.warn("could not parse hrmc_step value on line %s" % i)
                continue
            # handle Fortran double-precision float format
            val = columns[self.ERRGR_COLUMN_NUM]
            val = re_dbl_fort.sub(r'\1E\2', val)
            logger.debug("val=%s" % val)
            try:
                hrmc_errgr = float(val)
            except ValueError:
                logger.warn("could not parse hrmc_errgr value on line %s" % i)
                continue
            xs.append(hrmc_step)
            ys.append(hrmc_errgr)

        logger.debug("xs=%s" % xs)
        logger.debug("ys=%s" % ys)

        crit_url = storage.get_url_with_credentials(kwargs['storage_settings'],
            os.path.join(node_path, "criterion.txt"), is_relative_path=False)
        try:
            crit = storage.get_file(crit_url)
        except ValueError:
            crit = None
        except IOError:
            crit = None
        # FIXME: can crit be zero?
        logger.debug("crit=%s" % crit)
        if crit:
            system_id = int(getval(kwargs['run_settings'],
                '%s/system/id' % django_settings.SCHEMA_PREFIX))
            hrmcdset_val = {"hrmcdset/it": system_id, "hrmcdset/crit": crit}
        else:
            hrmcdset_val = {}

        # TODO: move into utility function for reuse
        def extract_psd_func(fp):
            res = []
            xs = []
            ys = []
            for i, line in enumerate(dataerrors_content.splitlines()):
                if i == 0:
                    continue
                columns = line.split()

                val = columns[self.STEP_COLUMN_NUM]
                val = re_dbl_fort.sub(r'\1E\2', val)
                logger.debug("val=%s" % val)
                try:
                    x = float(val)
                except ValueError:
                    logger.warn("could not parse value on line %s" % i)
                    continue

                val = columns[self.ERRGR_COLUMN_NUM]
                val = re_dbl_fort.sub(r'\1E\2', val)
                logger.debug("val=%s" % val)
                try:
                    y = float(val)
                except ValueError:
                    logger.warn("could not parse value on line %s" % i)
                    continue

                xs.append(x)
                ys.append(y)
            res = {"hrmcdfile/r1": xs, "hrmcdfile/g1": ys}
            return res

        def extract_psdexp_func(fp):
            res = []
            xs = []
            ys = []
            for i, line in enumerate(fp):
                columns = line.split()
                xs.append(float(columns[0]))
                ys.append(float(columns[1]))
            res = {"hrmcdfile/r2": xs, "hrmcdfile/g2": ys}
            return res

        def extract_grfinal_func(fp):
            res = []
            xs = []
            ys = []
            for i, line in enumerate(fp):
                columns = line.split()
                xs.append(float(columns[0]))
                ys.append(float(columns[1]))
            #FIXME: len(xs) == len(ys) for this to work.
            #TODO: hack to handle when xs and ys are too
            # large to fit in Parameter with db_index.
            # solved by function call at destination
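            # Keeping every (len(xs) / 50)-th point down-samples to roughly
            # 50 values; note this raises ZeroDivisionError if the file has
            # fewer than 50 rows (Python 2 integer division is assumed).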
            cut_xs = [xs[i] for i, x in enumerate(xs)
                if (i % (len(xs) / 50) == 0)]
            cut_ys = [ys[i] for i, x in enumerate(ys)
                if (i % (len(ys) / 50) == 0)]

            res = {"hrmcdfile/r3": cut_xs, "hrmcdfile/g3": cut_ys}
            return res

        def extract_inputgr_func(fp):
            res = []
            xs = []
            ys = []
            for i, line in enumerate(fp):
                columns = line.split()
                xs.append(float(columns[0]))
                ys.append(float(columns[1]))
            #FIXME: len(xs) == len(ys) for this to work.
            #TODO: hack to handle when xs and ys are too
            # large to fit in Parameter with db_index.
            # solved by function call at destination
            cut_xs = [xs[i] for i, x in enumerate(xs)
                if (i % (len(xs) / 50) == 0)]
            cut_ys = [ys[i] for i, x in enumerate(ys)
                if (i % (len(ys) / 50) == 0)]

            res = {"hrmcdfile/r4": cut_xs, "hrmcdfile/g4": cut_ys}
            return res
        #todo: replace self.boto_settings with mytardis_settings


        # Only save the experiment graph paramset once per experiment.
        if not self.final_graph_paramset:
            self.final_graph_paramset = [mytardis.create_graph_paramset("expgraph",
                name="hrmcexp2",
                graph_info={"axes": ["step", "ERRGr*wf"], "precision": [0, 2], "legends": legends},
                value_dict={},
                value_keys=exp_value_keys)]

            experiment_paramset = self.final_graph_paramset
        else:
            experiment_paramset = []

        dataset_paramset = [
            mytardis.create_paramset('hrmcdataset/output', []),
            mytardis.create_graph_paramset('dsetgraph',
                name="hrmcdset",
                graph_info={"axes":["r (Angstroms)", "PSD"],
                    "legends":["psd", "PSD_exp"],  "type":"line"},
                value_dict=hrmcdset_val,
                value_keys=[["hrmcdfile/r1", "hrmcdfile/g1"],
                    ["hrmcdfile/r2", "hrmcdfile/g2"]]),
            mytardis.create_graph_paramset('dsetgraph',
                name='hrmcdset2',
                graph_info={"axes":["r (Angstroms)", "g(r)"],
                    "legends":["data_grfinal", "input_gr"],
                    "type":"line"},
                value_dict={},
                value_keys=[["hrmcdfile/r3", "hrmcdfile/g3"],
                    ["hrmcdfile/r4", "hrmcdfile/g4"]]),
            mytardis.create_graph_paramset('dsetgraph',
                name='hrmcdset%s' % m,
                graph_info={},
                value_dict={"hrmcdset%s/step" % m: xs,
                    "hrmcdset%s/err" % m: ys},
                value_keys=[]),
            ]
        datafile_paramset = [
            mytardis.create_graph_paramset('dfilegraph',
                name="hrmcdfile",
                graph_info={},
                value_dict={},
                value_keys=[])
            ]
        dfile_extract_func = {
            'psd.dat': extract_psd_func,
            'PSD_exp.dat': extract_psdexp_func,
            'data_grfinal.dat': extract_grfinal_func,
            'input_gr.dat': extract_inputgr_func}
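        # Maps datafile basenames to the functions that turn file contents
        # into graph value_dict entries; create_dataset applies the matching
        # function when it posts each datafile.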
        logger.debug("experiment_paramset=%s" % experiment_paramset)
        logger.debug("dataset_paramset=%s" % dataset_paramset)
        logger.debug("datafile_paramset=%s" % datafile_paramset)
        logger.debug("dfile_extract_func=%s" % dfile_extract_func)

        return (experiment_paramset, dataset_paramset, datafile_paramset, dfile_extract_func)
Example n. 20
0
    def curate_dataset(self, run_settings, experiment_id, base_dir, output_url, all_settings):
        logger.debug("curate_dataset")
        iter_output_dir = os.path.join(base_dir, "output")
        logger.debug("iter_output_dir=%s" % iter_output_dir)

        output_prefix = '%s://%s@' % (all_settings['scheme'],
                                    all_settings['type'])
        iter_output_dir = "%s%s" % (output_prefix, iter_output_dir)
        logger.debug("iter_output_dir=%s" % iter_output_dir)
        logger.debug("output_url=%s" % output_url)
        (scheme, host, mypath, location, query_settings) = storage.parse_bdpurl(output_url)
        fsys = storage.get_filesystem(output_url)

        node_output_dirnames, _ = fsys.listdir(mypath)
        logger.debug("node_output_dirnames=%s" % node_output_dirnames)

        curate_data = (getval(run_settings, '%s/input/mytardis/curate_data' % RMIT_SCHEMA))
        if curate_data:
            if all_settings['mytardis_host']:

#         if mytardis_settings['mytardis_host']:

#             EXP_DATASET_NAME_SPLIT = 2

#             def get_exp_name_for_output(settings, url, path):
#                 return str(os.sep.join(path.split(os.sep)[:-EXP_DATASET_NAME_SPLIT]))

#             def get_dataset_name_for_output(settings, url, path):
#                 logger.debug("path=%s" % path)

#                 host = settings['host']
#                 prefix = 'ssh://%s@%s' % (settings['type'], host)

#                 source_url = smartconnectorscheduler.get_url_with_credentials(
#                     settings, os.path.join(prefix, path, "HRMC.inp_values"),
#                     is_relative_path=False)
#                 logger.debug("source_url=%s" % source_url)
#                 try:
#                     content = storage.get_file(source_url)
#                 except IOError, e:
#                     logger.warn("cannot read file %s" % e)
#                     return str(os.sep.join(path.split(os.sep)[-EXP_DATASET_NAME_SPLIT:]))

#                 logger.debug("content=%s" % content)
#                 try:
#                     values_map = dict(json.loads(str(content)))
#                 except Exception, e:
#                     logger.error("cannot load values_map %s: from %s.  Error=%s" % (content, source_url, e))
#                     return str(os.sep.join(path.split(os.sep)[-EXP_DATASET_NAME_SPLIT:]))

#                 try:
#                     iteration = str(path.split(os.sep)[-2:-1][0])
#                 except Exception, e:
#                     logger.error(e)
#                     iteration = ""

#                 if "_" in iteration:
#                     iteration = iteration.split("_")[1]
#                 else:
#                     iteration = "final"

#                 dataset_name = "%s_%s_%s" % (iteration,
#                     values_map['generator_counter'],
#                     values_map['run_counter'])
#                 logger.debug("dataset_name=%s" % dataset_name)
#                 return dataset_name

#             re_dbl_fort = re.compile(r'(\d*\.\d+)[dD]([-+]?\d+)')

#             logger.debug("new_output_dir=%s" % new_output_dir)
#             exp_value_keys = []
#             legends = []
#             for m, node_dir in enumerate(node_dirs):
#                 exp_value_keys.append(["hrmcdset%s/step" % m, "hrmcdset%s/err" % m])

#                 source_url = smartconnectorscheduler.get_url_with_credentials(output_storage_settings,
#                     output_prefix + os.path.join(new_output_dir, node_dir), is_relative_path=False)

#                 (source_scheme, source_location, source_path, source_location,
#                     query_settings) = storage.parse_bdpurl(source_url)
#                 logger.debug("source_url=%s" % source_url)
#                 legends.append(
#                     get_dataset_name_for_output(
#                         output_storage_settings, "", source_path))

#             logger.debug("exp_value_keys=%s" % exp_value_keys)
#             logger.debug("legends=%s" % legends)

#             graph_paramset = [mytardis.create_graph_paramset("expgraph",
#                 name="hrmcexp2",
#                 graph_info={"axes": ["step", "ERRGr*wf"], "precision": [0, 2], "legends": legends},
#                 value_dict={},
#                 value_keys=exp_value_keys)]

#             for m, node_dir in enumerate(node_dirs):

#                 dataerrors_url = smartconnectorscheduler.get_url_with_credentials(output_storage_settings,
#                     output_prefix + os.path.join(new_output_dir, node_dir, DATA_ERRORS_FILE), is_relative_path=False)
#                 dataerrors_content = storage.get_file(dataerrors_url)
#                 xs = []
#                 ys = []
#                 for i, line in enumerate(dataerrors_content.splitlines()):
#                     if i == 0:
#                         continue
#                     columns = line.split()
#                     try:
#                         hrmc_step = int(columns[STEP_COLUMN_NUM])
#                     except ValueError:
#                         logger.warn("could not parse hrmc_step value on line %s" % i)
#                         continue
#                     # handle  format double precision float format
#                     val = columns[ERRGR_COLUMN_NUM]
#                     val = re_dbl_fort.sub(r'\1E\2', val)
#                     logger.debug("val=%s" % val)





                EXP_DATASET_NAME_SPLIT = 2

                def get_exp_name_for_output(settings, url, path):
                    return str(os.sep.join(path.split(os.sep)[:-EXP_DATASET_NAME_SPLIT]))

                def get_dataset_name_for_output(settings, url, path):
                    logger.debug("path=%s" % path)

                    host = settings['host']
                    prefix = 'ssh://%s@%s' % (settings['type'], host)

                    source_url = get_url_with_credentials(
                        settings, os.path.join(prefix, path, "HRMC.inp_values"),
                        is_relative_path=False)
                    logger.debug("source_url=%s" % source_url)
                    try:
                        content = storage.get_file(source_url)
                    except IOError, e:
                        logger.warn("cannot read file %s" % e)
                        return str(os.sep.join(path.split(os.sep)[-EXP_DATASET_NAME_SPLIT:]))

                    logger.debug("content=%s" % content)
                    try:
                        values_map = dict(json.loads(str(content)))
                    except Exception, e:
                        logger.error("cannot load values_map %s: from %s.  Error=%s" % (content, source_url, e))
                        return str(os.sep.join(path.split(os.sep)[-EXP_DATASET_NAME_SPLIT:]))

                    try:
                        iteration = str(path.split(os.sep)[-2:-1][0])
                    except Exception, e:
                        logger.error(e)
                        iteration = ""

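                    # iteration comes from the parent directory name, e.g. a
                    # path ending in ".../output_3/<node_dir>" yields "3"; a
                    # name without "_" is treated as the final iteration.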
                    if "_" in iteration:
                        iteration = iteration.split("_")[1]
                    else:
                        iteration = "final"

                    dataset_name = "%s_%s_%s" % (iteration,
                        values_map['generator_counter'],
                        values_map['run_counter'])
                    logger.debug("dataset_name=%s" % dataset_name)
                    return dataset_name
Example n. 21
0
    def process_outputs(self, run_settings, base_dir, output_url, all_settings, offset):

        # output_dir = 118.138.241.232/outptuersdfsd/sweep277/hrmc278/output_1
        # output_prefix = ssh://unix@
        # node_output_dir = 2

        output_prefix = '%s://%s@' % (all_settings['scheme'],
                                    all_settings['type'])

        id = int(getval(run_settings, '%s/system/id' % RMIT_SCHEMA))
        iter_output_dir = os.path.join(base_dir, "output_%s" % id)
        logger.debug('iter_output_dir=%s' % iter_output_dir)
        logger.debug('output_prefix=%s' % output_prefix)
        #iter_output_dir = "%s%s" % (output_prefix, iter_output_dir)
        logger.debug('output_url=%s' % output_url)
        (scheme, host, iter_output_path, location, query_settings) = storage.parse_bdpurl(output_url)
        logger.debug("iter_output_path=%s" % iter_output_path)
        iter_out_fsys = storage.get_filesystem(output_url)
        logger.debug("iter_out_fsys=%s" % iter_out_fsys)
        node_output_dirnames, _ = iter_out_fsys.listdir(iter_output_path)
        logger.debug('node_output_dirnames=%s' % node_output_dirnames)
        self.audit = ""

        Node_info = namedtuple('Node_info',
            ['dirname', 'number', 'criterion'])

        BASE_FNAME = "HRMC.inp"

        # generate criteria
        self.outputs = []
        for node_output_dirname in node_output_dirnames:
            node_path = output_prefix + os.path.join(iter_output_dir, node_output_dirname)
            criterion = self.compute_psd_criterion(all_settings, node_path)
            #criterion = self.compute_hrmc_criterion(values_map['run_counter'], node_output_dirname, fs,)
            logger.debug("criterion=%s" % criterion)

            try:
                values_url = get_url_with_credentials(
                    all_settings, os.path.join(node_path,
                    '%s_values' % BASE_FNAME), is_relative_path=False)

                values_content = storage.get_file(values_url)

                logger.debug("values_file=%s" % values_url)
            except IOError:
                logger.warn("no values file found")
                values_map = {}
            else:
                values_map = dict(json.loads(values_content))

            self.outputs.append(Node_info(dirname=node_output_dirname,
                           number=values_map['run_counter'], criterion=criterion))

        if not self.outputs:
            logger.error("no ouput found for this iteration")
            return

        self.outputs.sort(key=lambda x: int(x.criterion))
        logger.debug("self.outputs=%s" % self.outputs)

        try:
            # FIXME: need to validate this output to make sure list of int
            threshold = ast.literal_eval(getval(run_settings, '%s/input/hrmc/threshold' % RMIT_SCHEMA))
        except (SettingNotFoundException, ValueError):
            logger.warn("no threshold found when expected")
            return False
        logger.debug("threshold = %s" % threshold)
        total_picks = 1
        if len(threshold) > 1:
            for i in threshold:
                total_picks *= i
        else:
            total_picks = threshold[0]
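        # e.g. a threshold of [2, 3] keeps 2 * 3 = 6 runs, while a
        # single-element threshold such as [4] keeps 4 (interpretation
        # assumed from the surrounding code).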

        def copy_files_with_pattern(iter_out_fsys, source_path,
                                 dest_path, pattern, all_settings):
            """
            """
            output_prefix = '%s://%s@' % (all_settings['scheme'],
                                    all_settings['type'])

            logger.debug('source_path=%s, dest_path=%s' % (source_path, dest_path))
            # (scheme, host, iter_output_path, location, query_settings) = storage.parse_bdpurl(source_path)
            _, node_output_fnames = iter_out_fsys.listdir(source_path)
            ip_address = all_settings['ip_address']
            for f in node_output_fnames:
                if fnmatch.fnmatch(f, pattern):
                    source_url = get_url_with_credentials(all_settings, output_prefix + os.path.join(ip_address, source_path, f), is_relative_path=False)
                    dest_url = get_url_with_credentials(all_settings, output_prefix + os.path.join(ip_address, dest_path, f), is_relative_path=False)
                    logger.debug('source_url=%s, dest_url=%s' % (source_url, dest_url))
                    content = storage.get_file(source_url)
                    storage.put_file(dest_url, content)

        # Make new input dirs
        new_input_dir = os.path.join(base_dir, "input_%d" % (id + 1))
        for index in range(0, total_picks):
            Node_info = self.outputs[index]
            logger.debug("node_info.dirname=%s" % Node_info.dirname)
            logger.debug("Node_info=%s" % str(Node_info))

            new_input_path = os.path.join(new_input_dir,
                Node_info.dirname)
            logger.debug("New input node dir %s" % new_input_path)

            old_output_path = os.path.join(iter_output_dir, Node_info.dirname)

            # Move all existing domain input files unchanged to next input directory
            for f in DOMAIN_INPUT_FILES:
                source_url = get_url_with_credentials(
                    all_settings, output_prefix + os.path.join(old_output_path, f), is_relative_path=False)
                dest_url = get_url_with_credentials(
                    all_settings, output_prefix + os.path.join(new_input_path, f),
                    is_relative_path=False)
                logger.debug('source_url=%s, dest_url=%s' % (source_url, dest_url))

                content = storage.get_file(source_url)
                logger.debug('content collected')
                storage.put_file(dest_url, content)
                logger.debug('put successfully')

            logger.debug('put file successfully')
            pattern = "*_values"
            output_offset = os.path.join(offset, "output_%s" % id, Node_info.dirname)
            input_offset = os.path.join(offset, "input_%s" % (id + 1), Node_info.dirname)
            copy_files_with_pattern(iter_out_fsys,
                output_offset,
                input_offset, pattern,
                all_settings)

            pattern = "*_template"
            copy_files_with_pattern(iter_out_fsys,
                output_offset,
                input_offset, pattern,
                all_settings)

            # NB: Converge stage triggers based on criterion value from audit.
            logger.debug('starting audit')
            info = "Run %s preserved (error %s)\n" % (Node_info.number, Node_info.criterion)
            audit_url = get_url_with_credentials(
                all_settings, output_prefix +
                os.path.join(new_input_path, 'audit.txt'), is_relative_path=False)
            storage.put_file(audit_url, info)
            logger.debug("audit=%s" % info)
            logger.debug('1:audit_url=%s' % audit_url)
            self.audit += info

            # move xyz_final.xyz to initial.xyz
            source_url = get_url_with_credentials(
                all_settings, output_prefix + os.path.join(old_output_path, "xyz_final.xyz"), is_relative_path=False)
            logger.debug('source_url=%s' % source_url)
            dest_url = get_url_with_credentials(
                all_settings, output_prefix + os.path.join(new_input_path, 'input_initial.xyz'), is_relative_path=False)
            logger.debug('dest_url=%s' % dest_url)
            content = storage.get_file(source_url)
            logger.debug('content=%s' % content)
            storage.put_file(dest_url, content)
            self.audit += "spawning diamond runs\n"

        logger.debug("input_dir=%s" % (output_prefix + os.path.join(new_input_dir, 'audit.txt')))
        audit_url = get_url_with_credentials(
            all_settings, output_prefix + os.path.join(new_input_dir, 'audit.txt'), is_relative_path=False)
        logger.debug('audit_url=%s' % audit_url)
        storage.put_file(audit_url, self.audit)
Example n. 22
0
    def curate_dataset(self, run_settings, experiment_id, base_dir, output_url,
        all_settings):

        iteration = int(getval(run_settings, '%s/system/id' % RMIT_SCHEMA))
        iter_output_dir = os.path.join(base_dir, "output_%s" % iteration)
        output_prefix = '%s://%s@' % (all_settings['scheme'],
                                    all_settings['type'])
        iter_output_dir = "%s%s" % (output_prefix, iter_output_dir)

        (scheme, host, mypath, location, query_settings) = storage.parse_bdpurl(output_url)
        fsys = storage.get_filesystem(output_url)

        node_output_dirnames, _ = fsys.listdir(mypath)
        logger.debug("node_output_dirnames=%s" % node_output_dirnames)

        if all_settings['mytardis_host']:
            for i, node_output_dirname in enumerate(node_output_dirnames):
                node_path = os.path.join(iter_output_dir, node_output_dirname)
                # find criterion
                crit = None  # is there an infinity criterion
                for ni in self.outputs:
                    if ni.dirname == node_output_dirname:
                        crit = ni.criterion
                        break
                else:
                    logger.debug("criterion not found")
                    continue
                logger.debug("crit=%s" % crit)

                # graph_params = []

                def extract_psd_func(fp):
                    res = []
                    xs = []
                    ys = []
                    for i, line in enumerate(fp):
                        columns = line.split()
                        xs.append(float(columns[0]))
                        ys.append(float(columns[1]))
                    res = {"hrmcdfile/r1": xs, "hrmcdfile/g1": ys}
                    return res

                def extract_psdexp_func(fp):
                    res = []
                    xs = []
                    ys = []
                    for i, line in enumerate(fp):
                        columns = line.split()
                        xs.append(float(columns[0]))
                        ys.append(float(columns[1]))
                    res = {"hrmcdfile/r2": xs, "hrmcdfile/g2": ys}
                    return res

                def extract_grfinal_func(fp):
                    res = []
                    xs = []
                    ys = []
                    for i, line in enumerate(fp):
                        columns = line.split()
                        xs.append(float(columns[0]))
                        ys.append(float(columns[1]))
                    #FIXME: len(xs) == len(ys) for this to work.
                    #TODO: hack to handle when xs and ys are too
                    # large to fit in Parameter with db_index.
                    # solved by function call at destination
                    cut_xs = [xs[i] for i, x in enumerate(xs)
                        if (i % (len(xs) / 20) == 0)]
                    cut_ys = [ys[i] for i, x in enumerate(ys)
                        if (i % (len(ys) / 20) == 0)]

                    res = {"hrmcdfile/r3": cut_xs, "hrmcdfile/g3": cut_ys}
                    return res

                def extract_inputgr_func(fp):
                    res = []
                    xs = []
                    ys = []
                    for i, line in enumerate(fp):
                        columns = line.split()
                        xs.append(float(columns[0]))
                        ys.append(float(columns[1]))
                    #FIXME: len(xs) == len(ys) for this to work.
                    #TODO: hack to handle when xs and ys are too
                    # large to fit in Parameter with db_index.
                    # solved by function call at destination
                    cut_xs = [xs[i] for i, x in enumerate(xs)
                        if (i % (len(xs) / 20) == 0)]
                    cut_ys = [ys[i] for i, x in enumerate(ys)
                        if (i % (len(ys) / 20) == 0)]

                    res = {"hrmcdfile/r4": cut_xs, "hrmcdfile/g4": cut_ys}
                    return res

                #TODO: hrmcexp graph should be tagged to input directories (not output directories)
                #because we want the result after pruning.
                #todo: replace self.boto_settings with mytardis_settings

                EXP_DATASET_NAME_SPLIT = 2

                def get_exp_name_for_output(settings, url, path):
                    # return str(os.sep.join(path.split(os.sep)[:-EXP_DATASET_NAME_SPLIT]))
                    return str(os.sep.join(path.split(os.sep)[-4:-2]))

                def get_dataset_name_for_output(settings, url, path):
                    logger.debug("path=%s" % path)

                    host = settings['host']
                    prefix = 'ssh://%s@%s' % (settings['type'], host)

                    source_url = get_url_with_credentials(
                        settings, os.path.join(prefix, path, "HRMC.inp_values"),
                        is_relative_path=False)
                    logger.debug("source_url=%s" % source_url)
                    try:
                        content = storage.get_file(source_url)
                    except IOError, e:
                        logger.warn("cannot read file %s" % e)
                        return str(os.sep.join(path.split(os.sep)[-EXP_DATASET_NAME_SPLIT:]))

                    logger.debug("content=%s" % content)
                    try:
                        values_map = dict(json.loads(str(content)))
                    except Exception, e:
                        logger.warn("cannot load %s: %s" % (content, e))
                        return str(os.sep.join(path.split(os.sep)[-EXP_DATASET_NAME_SPLIT:]))

                    try:
                        iteration = str(path.split(os.sep)[-2:-1][0])
                    except Exception, e:
                        logger.error(e)
                        iteration = ""

                    if "_" in iteration:
                        iteration = iteration.split("_")[1]
                    else:
                        iteration = "final"

                    dataset_name = "%s_%s_%s" % (iteration,
                        values_map['generator_counter'],
                        values_map['run_counter'])
                    logger.debug("dataset_name=%s" % dataset_name)
                    return dataset_name
Example n. 23
0
def create_dataset(settings,
                   source_url,
                   exp_id,
                   exp_name=_get_exp_name,
                   dataset_name=_get_dataset_name,
                   experiment_paramset=[],
                   dataset_paramset=[],
                   datafile_paramset=[],
                   dfile_extract_func=None):
    """


        POST to mytardis_host REST API with mytardis_user and mytardis_password
        credentials to create or update experiment for a new dataset containing
        datafiles from source_url BDP directory.

        :param dict settings.keys(): ['mytardis_user', 'mytardis_password', 'mytardis_host']
        :param str source_url: chiminey URL for the source of dataset
        :param int exp_id: unique experiment id for existing experiment or 0 for new
        :param func exp_name: function that returns experiment name based on url and path
        :param func dataset_name: function that returns dataset name based on url and path
        :param paramset dataset_param: metadata package for dataset
        :param paramset datafile_paramset: metadata package for datafiles
        :param func dfile_extract_func: function that extracts datafile information
        :return: new mytardis experiment id
        :rtype: int
        :raises: IndexError if setttings does not contain required configuration fields or is otherwise invalid.

        If exp_id is non-zero, adds to existing experiment with exp_id, else new created
        identifier returned.  experiment_paramset is appended to any existing
        metadata and does not overwrite.

    """
    #FIXME,TODO: What if mytardis is unavailable?  The connection to mytardis is
    #probably better handled as a separate celery subtask, which can retry until
    #it works and run asynchronously.

    #FIXME: missing all error checking and retrying of the connection to mytardis.
    #The reliability framework should be able to supply this?

    #TODO: method should take the BDP url source_url, not the expanded one.

    logger.debug("post_dataset")
    tardis_user = settings["mytardis_user"]
    tardis_pass = settings["mytardis_password"]
    tardis_ssl = int(settings["mytardis_ssl"])
    tardis_protocol = "http://%s"
    if tardis_ssl > 0:
        tardis_protocol = "https://%s"
    tardis_host_url = tardis_protocol % settings["mytardis_host"]
    tardis_port = settings["mytardis_port"]
    logger.debug("posting dataset from %s to mytardis at %s with %s" %
                 (source_url, tardis_host_url, tardis_pass))

    (source_scheme, source_location, source_path, source_location,
     query_settings) = storage.parse_bdpurl(source_url)

    logger.debug("source_path=%s" % source_path)

    if source_scheme == "file":
        root_path = _get_value('root_path', query_settings)
    else:
        logger.debug('scheme=%s' % source_scheme)
        #raise InvalidInputError("only file source_scheme supported for source of mytardis transfer")

    expname = exp_name(settings, source_url, source_path)
    new_exp_id = create_experiment(settings, exp_id, expname,
                                   experiment_paramset)

    new_experiment_uri = "/api/v1/experiment/%s/" % new_exp_id

    # TODO: check that we do not already have a dataset with
    # the same name, and either overwrite it or skip the move.
    # save dataset
    logger.debug("saving dataset in experiment at %s" % new_exp_id)
    url = "%s:%s/api/v1/dataset/?format=json" % (tardis_host_url, tardis_port)
    headers = {'content-type': 'application/json'}

    schemas = dataset_paramset

    logger.debug("schemas=%s" % schemas)
    data = json.dumps({
        'experiments': [new_experiment_uri],
        'description':
        dataset_name(settings, source_url, source_path),
        "parameter_sets":
        schemas
    })
    logger.debug("data=%s" % data)
    logger.debug("post to %s" % url)
    r = requests.post(url,
                      data=data,
                      headers=headers,
                      auth=HTTPBasicAuth(tardis_user, tardis_pass),
                      verify=False)
    # FIXME: need to check for status_code and handle failures.
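    # A minimal guard could be (a sketch; a created dataset is assumed to
    # return 201):
    #   if r.status_code != 201:
    #       raise Exception("dataset creation failed: %s" % r.text)
    # otherwise the 'location' header lookup below fails on error responses.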

    logger.debug("r.json=%s" % r.json)
    logger.debug("r.text=%s" % r.text)
    logger.debug("r.headers=%s" % r.headers)
    header_location = r.headers['location']
    new_dataset_uri = header_location[len(tardis_host_url):]

    # move files across
    source_files = storage.list_all_files(source_url)
    logger.debug("source_files=%s" % source_files)
    url = "%s:%s/api/v1/dataset_file/" % (tardis_host_url, tardis_port)

    args = source_url.split('?')[1]

    logger.debug('args=%s' % args)

    staging_dir = tempfile.mkdtemp(suffix="", prefix="chiminey")
    try:
        for fname in source_files:
            logger.debug('fname=%s' % os.path.join(source_location, fname))

            source_file_url = "%s://%s?%s" % (
                source_scheme, os.path.join(source_location, fname), args)
            logger.debug('source_file_url=%s' % source_file_url)

            # TODO: add retrying to this operation.
            source_file = storage.get_filep(source_file_url,
                                            sftp_reference=False)

            #logger.debug('source_file=%s' % source_file._name)

            # we have to load the contents locally at least once.
            f_contents = source_file.read()

            # Make a temporary copy, as the mytardis datafile POST requires a filename
            tempfname = os.path.basename(fname)
            with open(os.path.join(staging_dir, tempfname), 'wb') as fp:
                fp.write(f_contents)

            new_datafile_paramset = []
            logger.debug("datafile_paramset=%s" % datafile_paramset)

            for paramset in datafile_paramset:
                new_paramset = {}
                logger.debug("paramset=%s" % paramset)
                new_paramset['schema'] = paramset['schema']

                has_value = False
                has_keys = False
                new_param_vals = []

                for param in paramset['parameters']:
                    new_param = {}

                    for param_key, v in param.iteritems():
                        logger.debug("param_key=%s v=%s" % (param_key, v))
                        if param_key == 'name' and v == "value_dict":
                            new_param['name'] = 'value_dict'
                            new_value = {}

                            found_func_match = False
                            for fn, func in dfile_extract_func.iteritems():
                                logger.debug("fn=%s,func=%s" % (fn, func))
                                if fn == os.path.basename(fname):
                                    # if fn file is very long, this is inefficient
                                    logger.debug("fname=%s" %
                                                 os.path.join(staging_dir, fn))
                                    with open(os.path.join(staging_dir, fn),
                                              'r') as fp:
                                        new_value.update(func(fp))
                                    found_func_match = True  # FIXME: can multiple funcs match?
                                    logger.debug("matched %s %s" % (fn, func))

                            logger.debug("new_value=%s" % new_value)

                            new_param['string_value'] = json.dumps(
                                new_value
                            ) if found_func_match else param['string_value']

                            break
                        else:
                            # in case string_value is processed first
                            new_param[param_key] = v

                    logger.debug("string_value len=%s" %
                                 new_param['string_value'])

                    if new_param['name'] == "value_dict" and len(
                            json.loads(new_param['string_value'])):
                        has_value = True
                    logger.debug("has_value=%s" % has_value)

                    if new_param['name'] == "value_keys" and len(
                            json.loads(new_param['string_value'])):
                        has_keys = True
                    logger.debug("has_keys=%s" % has_keys)

                    new_param_vals.append(new_param)

                new_paramset['parameters'] = new_param_vals

                logger.debug("has_value=%s" % has_value)
                logger.debug("has_keys=%s" % has_keys)

                if has_value or has_keys:
                    new_datafile_paramset.append(new_paramset)
                else:
                    logger.debug("not adding %s" % new_paramset)

            logger.debug("new_datafile_paramset=%s" % new_datafile_paramset)
            file_size = len(f_contents)
            logger.debug("file_size=%s" % file_size)
            if file_size:

                data = json.dumps({
                    u'dataset':
                    str(new_dataset_uri),
                    u'parameter_sets':
                    new_datafile_paramset,
                    u'filename':
                    os.path.basename(fname),
                    u'size':
                    file_size,
                    u'mimetype':
                    'text/plain',
                    u'md5sum':
                    hashlib.md5(f_contents).hexdigest()
                })
                logger.debug("data=%s" % data)

                with open(os.path.join(staging_dir, tempfname), 'rb') as fp:

                    r = requests.post(url,
                                      data={"json_data": data},
                                      headers={'Accept': 'application/json'},
                                      files={'attached_file': fp},
                                      auth=HTTPBasicAuth(
                                          tardis_user, tardis_pass),
                                      verify=False)

                    # FIXME: need to check for status_code and handle failures.
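                    # A minimal guard could be (a sketch; a created datafile
                    # is assumed to return 201):
                    #   if r.status_code != 201:
                    #       logger.error("datafile post failed: %s" % r.text)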
                    logger.debug("r.js=%s" % r.json)
                    logger.debug("r.te=%s" % r.text)
                    logger.debug("r.he=%s" % r.headers)

            else:
                logger.warn("not transferring empty file %s" % fname)
                #TODO: check whether mytardis api can accept zero length files

    finally:
        shutil.rmtree(staging_dir)

    return new_exp_id
Example n. 24
0
class HRMCConverge(Converge):

    def input_valid(self, settings_to_test):
        """ Return a tuple, where the first element is True settings_to_test
        are syntactically and semantically valid for this stage.  Otherwise,
        return False with the second element in the tuple describing the
        problem
        """
        error = []
        try:
            int(getval(settings_to_test, '%s/input/hrmc/max_iteration' % RMIT_SCHEMA))
        except (ValueError, SettingNotFoundException):
            error.append("Cannot load max_iteration")

        try:
            float(getval(settings_to_test, '%s/input/hrmc/error_threshold' % RMIT_SCHEMA))
        except (SettingNotFoundException, ValueError):
            error.append("Cannot load error threshold")

        if error:
            return (False, '. '.join(error))
        return (True, "ok")

    def curate_dataset(self, run_settings, experiment_id, base_dir, output_url, all_settings):
        logger.debug("curate_dataset")
        iter_output_dir = os.path.join(base_dir, "output")
        logger.debug("iter_output_dir=%s" % iter_output_dir)

        output_prefix = '%s://%s@' % (all_settings['scheme'],
                                    all_settings['type'])
        iter_output_dir = "%s%s" % (output_prefix, iter_output_dir)
        logger.debug("iter_output_dir=%s" % iter_output_dir)
        logger.debug("output_url=%s" % output_url)
        (scheme, host, mypath, location, query_settings) = storage.parse_bdpurl(output_url)
        fsys = storage.get_filesystem(output_url)

        node_output_dirnames, _ = fsys.listdir(mypath)
        logger.debug("node_output_dirnames=%s" % node_output_dirnames)

        curate_data = (getval(run_settings, '%s/input/mytardis/curate_data' % RMIT_SCHEMA))
        if curate_data:
            if all_settings['mytardis_host']:

#         if mytardis_settings['mytardis_host']:

#             EXP_DATASET_NAME_SPLIT = 2

#             def get_exp_name_for_output(settings, url, path):
#                 return str(os.sep.join(path.split(os.sep)[:-EXP_DATASET_NAME_SPLIT]))

#             def get_dataset_name_for_output(settings, url, path):
#                 logger.debug("path=%s" % path)

#                 host = settings['host']
#                 prefix = 'ssh://%s@%s' % (settings['type'], host)

#                 source_url = smartconnectorscheduler.get_url_with_credentials(
#                     settings, os.path.join(prefix, path, "HRMC.inp_values"),
#                     is_relative_path=False)
#                 logger.debug("source_url=%s" % source_url)
#                 try:
#                     content = storage.get_file(source_url)
#                 except IOError, e:
#                     logger.warn("cannot read file %s" % e)
#                     return str(os.sep.join(path.split(os.sep)[-EXP_DATASET_NAME_SPLIT:]))

#                 logger.debug("content=%s" % content)
#                 try:
#                     values_map = dict(json.loads(str(content)))
#                 except Exception, e:
#                     logger.error("cannot load values_map %s: from %s.  Error=%s" % (content, source_url, e))
#                     return str(os.sep.join(path.split(os.sep)[-EXP_DATASET_NAME_SPLIT:]))

#                 try:
#                     iteration = str(path.split(os.sep)[-2:-1][0])
#                 except Exception, e:
#                     logger.error(e)
#                     iteration = ""

#                 if "_" in iteration:
#                     iteration = iteration.split("_")[1]
#                 else:
#                     iteration = "final"

#                 dataset_name = "%s_%s_%s" % (iteration,
#                     values_map['generator_counter'],
#                     values_map['run_counter'])
#                 logger.debug("dataset_name=%s" % dataset_name)
#                 return dataset_name

#             re_dbl_fort = re.compile(r'(\d*\.\d+)[dD]([-+]?\d+)')

#             logger.debug("new_output_dir=%s" % new_output_dir)
#             exp_value_keys = []
#             legends = []
#             for m, node_dir in enumerate(node_dirs):
#                 exp_value_keys.append(["hrmcdset%s/step" % m, "hrmcdset%s/err" % m])

#                 source_url = smartconnectorscheduler.get_url_with_credentials(output_storage_settings,
#                     output_prefix + os.path.join(new_output_dir, node_dir), is_relative_path=False)

#                 (source_scheme, source_location, source_path, source_location,
#                     query_settings) = storage.parse_bdpurl(source_url)
#                 logger.debug("source_url=%s" % source_url)
#                 legends.append(
#                     get_dataset_name_for_output(
#                         output_storage_settings, "", source_path))

#             logger.debug("exp_value_keys=%s" % exp_value_keys)
#             logger.debug("legends=%s" % legends)

#             graph_paramset = [mytardis.create_graph_paramset("expgraph",
#                 name="hrmcexp2",
#                 graph_info={"axes": ["step", "ERRGr*wf"], "precision": [0, 2], "legends": legends},
#                 value_dict={},
#                 value_keys=exp_value_keys)]

#             for m, node_dir in enumerate(node_dirs):

#                 dataerrors_url = smartconnectorscheduler.get_url_with_credentials(output_storage_settings,
#                     output_prefix + os.path.join(new_output_dir, node_dir, DATA_ERRORS_FILE), is_relative_path=False)
#                 dataerrors_content = storage.get_file(dataerrors_url)
#                 xs = []
#                 ys = []
#                 for i, line in enumerate(dataerrors_content.splitlines()):
#                     if i == 0:
#                         continue
#                     columns = line.split()
#                     try:
#                         hrmc_step = int(columns[STEP_COLUMN_NUM])
#                     except ValueError:
#                         logger.warn("could not parse hrmc_step value on line %s" % i)
#                         continue
#                     # handle  format double precision float format
#                     val = columns[ERRGR_COLUMN_NUM]
#                     val = re_dbl_fort.sub(r'\1E\2', val)
#                     logger.debug("val=%s" % val)





                EXP_DATASET_NAME_SPLIT = 2

                def get_exp_name_for_output(settings, url, path):
                    return str(os.sep.join(path.split(os.sep)[:-EXP_DATASET_NAME_SPLIT]))

                def get_dataset_name_for_output(settings, url, path):
                    logger.debug("path=%s" % path)

                    host = settings['host']
                    prefix = 'ssh://%s@%s' % (settings['type'], host)

                    source_url = get_url_with_credentials(
                        settings, os.path.join(prefix, path, "HRMC.inp_values"),
                        is_relative_path=False)
                    logger.debug("source_url=%s" % source_url)
                    try:
                        content = storage.get_file(source_url)
                    except IOError, e:
                        logger.warn("cannot read file %s" % e)
                        return str(os.sep.join(path.split(os.sep)[-EXP_DATASET_NAME_SPLIT:]))

                    logger.debug("content=%s" % content)
                    try:
                        values_map = dict(json.loads(str(content)))
                    except Exception, e:
                        logger.error("cannot load values_map %s: from %s.  Error=%s" % (content, source_url, e))
                        return str(os.sep.join(path.split(os.sep)[-EXP_DATASET_NAME_SPLIT:]))

                    try:
                        iteration = str(path.split(os.sep)[-2:-1][0])
                    except Exception, e:
                        logger.error(e)
                        iteration = ""

                    if "_" in iteration:
                        iteration = iteration.split("_")[1]
                    else:
                        iteration = "final"

                    dataset_name = "%s_%s_%s" % (iteration,
                        values_map['generator_counter'],
                        values_map['run_counter'])
                    logger.debug("dataset_name=%s" % dataset_name)
                    return dataset_name

                re_dbl_fort = re.compile(r'(\d*\.\d+)[dD]([-+]?\d+)')
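                # e.g. a Fortran-style value such as "0.12345D+02" becomes
                # "0.12345E+02" so that float() below can parse it.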

                exp_value_keys = []
                legends = []
                for m, node_dir in enumerate(node_output_dirnames):
                    node_path = os.path.join(iter_output_dir, node_dir)

                    exp_value_keys.append(["hrmcdset%s/step" % m, "hrmcdset%s/err" % m])

                    source_url = get_url_with_credentials(all_settings,
                                                   node_path, is_relative_path=False)

                    (source_scheme, source_location, source_path, source_location,
                        query_settings) = storage.parse_bdpurl(source_url)
                    logger.debug("source_url=%s" % source_url)
                    legends.append(
                        get_dataset_name_for_output(
                            all_settings, "", source_path))

                logger.debug("exp_value_keys=%s" % exp_value_keys)
                logger.debug("legends=%s" % legends)

                graph_paramset = [mytardis.create_graph_paramset("expgraph",
                    name="hrmcexp2",
                    graph_info={"axes": ["step", "ERRGr*wf"], "precision": [0, 2], "legends": legends},
                    value_dict={},
                    value_keys=exp_value_keys)]

                for m, node_dir in enumerate(node_output_dirnames):
                    node_path = os.path.join(iter_output_dir, node_dir)
                    logger.debug("node_path=%s" % node_path)

                    #FIXME: this calculation should be done as in extract_psd_func
                    # pulling directly from data_errors rather than passing in
                    # through nested function.
                    dataerrors_url = get_url_with_credentials(all_settings,
                        os.path.join(node_path, DATA_ERRORS_FILE),
                        is_relative_path=False)
                    logger.debug("dataerrors_url=%s" % dataerrors_url)
                    dataerrors_content = storage.get_file(dataerrors_url)
                    xs = []
                    ys = []
                    for i, line in enumerate(dataerrors_content.splitlines()):
                        if i == 0:
                            continue
                        columns = line.split()
                        try:
                            hrmc_step = int(columns[STEP_COLUMN_NUM])
                        except ValueError:
                            logger.warn("could not parse hrmc_step value on line %s" % i)
                            continue
                        # handle Fortran double-precision float format
                        val = columns[ERRGR_COLUMN_NUM]
                        val = re_dbl_fort.sub(r'\1E\2', val)
                        logger.debug("val=%s" % val)
                        try:
                            hrmc_errgr = float(val)
                        except ValueError:
                            logger.warn("could not parse hrmc_errgr value on line %s" % i)
                            continue
                        xs.append(hrmc_step)
                        ys.append(hrmc_errgr)

                    logger.debug("xs=%s" % xs)
                    logger.debug("ys=%s" % ys)

                    crit_url = get_url_with_credentials(all_settings,
                        os.path.join(node_path, "criterion.txt"), is_relative_path=False)
                    try:
                        crit = storage.get_file(crit_url)
                    except ValueError:
                        crit = None
                    except IOError:
                        crit = None
                    # FIXME: can crit be zero?
                    if crit:
                        hrmcdset_val = {"hrmcdset/it": self.id, "hrmcdset/crit": crit}
                    else:
                        hrmcdset_val = {}

                    source_url = get_url_with_credentials(
                        all_settings, node_path, is_relative_path=False)
                    logger.debug("source_url=%s" % source_url)

                    # TODO: move into utility function for reuse
                    def extract_psd_func(fp):
                        res = []
                        xs = []
                        ys = []
                        for i, line in enumerate(dataerrors_content.splitlines()):
                            if i == 0:
                                continue
                            columns = line.split()

                            val = columns[STEP_COLUMN_NUM]
                            val = re_dbl_fort.sub(r'\1E\2', val)
                            logger.debug("val=%s" % val)
                            try:
                                x = float(val)
                            except ValueError:
                                logger.warn("could not parse value on line %s" % i)
                                continue

                            val = columns[ERRGR_COLUMN_NUM]
                            val = re_dbl_fort.sub(r'\1E\2', val)
                            logger.debug("val=%s" % val)
                            try:
                                y = float(val)
                            except ValueError:
                                logger.warn("could not parse value on line %s" % i)
                                continue

                            xs.append(x)
                            ys.append(y)
                        res = {"hrmcdfile/r1": xs, "hrmcdfile/g1": ys}
                        return res

                    def extract_psdexp_func(fp):
                        res = []
                        xs = []
                        ys = []
                        for i, line in enumerate(fp):
                            columns = line.split()
                            xs.append(float(columns[0]))
                            ys.append(float(columns[1]))
                        res = {"hrmcdfile/r2": xs, "hrmcdfile/g2": ys}
                        return res

                    def extract_grfinal_func(fp):
                        res = []
                        xs = []
                        ys = []
                        for i, line in enumerate(fp):
                            columns = line.split()
                            xs.append(float(columns[0]))
                            ys.append(float(columns[1]))
                        #FIXME: len(xs) == len(ys) for this to work.
                        #TODO: hack to handle when xs and ys are too
                        # large to fit in Parameter with db_index.
                        # solved by function call at destination
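                        # Keeping every (len(xs) / 20)-th point down-samples
                        # to roughly 20 values; this raises ZeroDivisionError
                        # if the file has fewer than 20 rows.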
                        cut_xs = [xs[i] for i, x in enumerate(xs)
                            if (i % (len(xs) / 20) == 0)]
                        cut_ys = [ys[i] for i, x in enumerate(ys)
                            if (i % (len(ys) / 20) == 0)]

                        res = {"hrmcdfile/r3": cut_xs, "hrmcdfile/g3": cut_ys}
                        return res

                    def extract_inputgr_func(fp):
                        res = []
                        xs = []
                        ys = []
                        for i, line in enumerate(fp):
                            columns = line.split()
                            xs.append(float(columns[0]))
                            ys.append(float(columns[1]))
                        #FIXME: len(xs) == len(ys) for this to work.
                        #TODO: hack to handle when xs and ys are too
                        # large to fit in Parameter with db_index.
                        # solved by function call at destination
                        cut_xs = [xs[i] for i, x in enumerate(xs)
                            if (i % (len(xs) / 20) == 0)]
                        cut_ys = [ys[i] for i, x in enumerate(ys)
                            if (i % (len(ys) / 20) == 0)]

                        res = {"hrmcdfile/r4": cut_xs, "hrmcdfile/g4": cut_ys}
                        return res
                    #todo: replace self.boto_settings with mytardis_settings

                    experiment_id = mytardis.create_dataset(
                        settings=all_settings,
                        source_url=source_url,
                        exp_name=get_exp_name_for_output,
                        dataset_name=get_dataset_name_for_output,
                        exp_id=experiment_id,
                        experiment_paramset=graph_paramset,
                        dataset_paramset=[
                            mytardis.create_paramset('hrmcdataset/output', []),
                            mytardis.create_graph_paramset('dsetgraph',
                                name="hrmcdset",
                                graph_info={"axes":["r (Angstroms)", "PSD"],
                                    "legends":["psd", "PSD_exp"],  "type":"line"},
                                value_dict=hrmcdset_val,
                                value_keys=[["hrmcdfile/r1", "hrmcdfile/g1"],
                                    ["hrmcdfile/r2", "hrmcdfile/g2"]]),
                            mytardis.create_graph_paramset('dsetgraph',
                                name='hrmcdset2',
                                graph_info={"axes":["r (Angstroms)", "g(r)"],
                                    "legends":["data_grfinal", "input_gr"],
                                    "type":"line"},
                                value_dict={},
                                value_keys=[["hrmcdfile/r3", "hrmcdfile/g3"],
                                    ["hrmcdfile/r4", "hrmcdfile/g4"]]),
                            mytardis.create_graph_paramset('dsetgraph',
                                name='hrmcdset%s' % m,
                                graph_info={},
                                value_dict={"hrmcdset%s/step" % m: xs,
                                    "hrmcdset%s/err" % m: ys},
                                value_keys=[]),
                            ],
                        datafile_paramset=[
                            mytardis.create_graph_paramset('dfilegraph',
                                name="hrmcdfile",
                                graph_info={},
                                value_dict={},
                                value_keys=[])
                            ],
                        dfile_extract_func={
                            'psd.dat': extract_psd_func,
                            'PSD_exp.dat': extract_psdexp_func,
                            'data_grfinal.dat': extract_grfinal_func,
                            'input_gr.dat': extract_inputgr_func}
                        )
                    graph_paramset = []
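
The extract_grfinal_func and extract_inputgr_func helpers above thin the series with i % (len(xs) / 20) == 0, which divides by zero whenever fewer than 20 points are read. A minimal sketch of a safer downsampler under the same assumptions (equal-length lists, keep roughly every n-th point); the helper name cut_series is hypothetical and not part of the original module:

def cut_series(values, target=20):
    """Keep roughly `target` evenly spaced points from `values`."""
    step = max(len(values) // target, 1)  # never 0, so the modulo below is safe
    return [v for i, v in enumerate(values) if i % step == 0]

# cut_xs, cut_ys = cut_series(xs), cut_series(ys)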
Esempio n. 25
0
    def curate_dataset(self, run_settings, experiment_id,
                       base_url, output_url, all_settings):
        '''
            Curates dataset
        '''
        # Retrieves process directories below the current output location
        iteration = int(getval(run_settings, '%s/system/id' % SCHEMA_PREFIX))
        output_prefix = '%s://%s@' % (all_settings['scheme'],
                                    all_settings['type'])
        current_output_url = "%s%s" % (output_prefix, os.path.join(
            base_url, "output_%s" % iteration))
        (scheme, host, current_output_path, location, query_settings) = storage.parse_bdpurl(output_url)
        output_fsys = storage.get_filesystem(output_url)
        process_output_dirs, _ = output_fsys.listdir(current_output_path)

        # Curates a dataset with metadata per process
        for i, process_output_dir in enumerate(process_output_dirs):
            # Expand the process output directory and add credentials for access
            process_output_url = '/'.join([current_output_url, process_output_dir])
            process_output_url_with_cred = get_url_with_credentials(
                    all_settings,
                    process_output_url,
                    is_relative_path=False)
            # Expand the process output file and add credentials for access
            output_file_url_with_cred = storage.get_url_with_credentials(
                all_settings, '/'.join([process_output_url, OUTPUT_FILE]),
                is_relative_path=False)
            try:
                output_content = storage.get_file(output_file_url_with_cred)
                val1, val2 = output_content.split()
            except (ValueError, IndexError, IOError) as e:
                logger.warn(e)
                continue
            try:
                x = float(val1)
                y = float(val2)
            except (ValueError, IndexError) as e:
                logger.warn(e)
                continue

            # Returns the process id as MyTardis dataset name
            all_settings['graph_point_id'] = str(i)
            def _get_dataset_name(settings, url, path):
                return all_settings['graph_point_id']

            # Creates new dataset and adds to experiment
            # If experiment_id==0, creates new experiment
            experiment_id = mytardis.create_dataset(
                settings=all_settings, # MyTardis credentials
                source_url=process_output_url_with_cred,
                exp_id=experiment_id,
                dataset_name=_get_dataset_name, # the function that defines dataset name
                dataset_paramset=[
                    # a new blank parameter set conforming to schema 'remotemake/output'
                    mytardis.create_paramset("remotemake/output", []),
                    mytardis.create_graph_paramset("dsetgraph", # name of schema
                        name="randdset", # a unique dataset name
                        graph_info={},
                        value_dict={"randdset/x": x, "randdset/y": y},  # values to be used in experiment graphs
                        value_keys=[]
                        ),
                    ]
                )
        return experiment_id
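
curate_dataset above expects OUTPUT_FILE to contain exactly two whitespace-separated numbers and skips the process directory when reading or parsing fails. A minimal sketch of that parse factored into one guarded helper, under the same two-column assumption; _parse_xy is a hypothetical name, not part of the original code:

def _parse_xy(content):
    """Return (x, y) as floats from a two-column text blob, or None on failure."""
    parts = content.split()
    if len(parts) < 2:
        return None
    try:
        return float(parts[0]), float(parts[1])
    except ValueError:
        return None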
Esempio n. 26
0
    def create_dataset_for_intermediate_output(self,
                                               run_settings,
                                               experiment_id,
                                               base_dir,
                                               output_url,
                                               all_settings,
                                               outputs=[]):
        logger.debug('self_outputs_curate=%s' % outputs)
        iteration = int(
            getval(run_settings, '%s/system/id' % self.SCHEMA_PREFIX))
        iter_output_dir = os.path.join(base_dir, "output_%s" % iteration)
        output_prefix = '%s://%s@' % (all_settings['scheme'],
                                      all_settings['type'])
        iter_output_dir = "%s%s" % (output_prefix, iter_output_dir)

        (scheme, host, mypath, location,
         query_settings) = storage.parse_bdpurl(output_url)
        fsys = storage.get_filesystem(output_url)

        node_output_dirnames, _ = fsys.listdir(mypath)
        logger.debug("node_output_dirnames=%s" % node_output_dirnames)

        if all_settings['mytardis_host']:
            output_dirs = []
            for m, dir_name in enumerate(node_output_dirnames):
                output_dirs.append(os.path.join(iter_output_dir, dir_name))

            for i, output_dir in enumerate(output_dirs):
                dataset_paramset = []
                datafile_paramset = []
                dfile_extract_func = {}
                self.load_metadata_builder(run_settings)
                if self.METADATA_BUILDER:
                    (continue_loop, dataset_paramset, datafile_paramset,
                     dfile_extract_func) = \
                        self.METADATA_BUILDER.build_metadata_for_intermediate_output(
                            output_dir, outputs, run_settings=run_settings,
                            storage_settings=all_settings, output_dirs=output_dirs)
                    if continue_loop:
                        continue

                source_dir_url = get_url_with_credentials(
                    all_settings, output_dir, is_relative_path=False)
                logger.debug("source_dir_url=%s" % source_dir_url)
                logger.debug('all_settings_here=%s' % all_settings)
                system_id = int(
                    getval(run_settings, '%s/system/id' %
                           self.SCHEMA_PREFIX))  #TODO Mytardis

                experiment_id = mytardis.create_dataset(
                    settings=all_settings,
                    source_url=source_dir_url,
                    exp_id=experiment_id,
                    exp_name=mytardis.get_exp_name_for_intermediate_output,
                    dataset_name=mytardis.get_dataset_name_for_output,
                    dataset_paramset=dataset_paramset,
                    datafile_paramset=datafile_paramset,
                    dfile_extract_func=dfile_extract_func)
        else:
            logger.warn("no mytardis host specified")
            return 0
        return experiment_id
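
The dfile_extract_func argument passed to mytardis.create_dataset maps datafile basenames to callables that receive an open file object and return a dict of graph keys to value lists, as the HRMC extractors elsewhere in this listing do. A minimal sketch of such an extractor for a generic two-column file; the file name and key names here are placeholders, not schema names from the original:

def extract_two_columns(fp):
    """Read a two-column text file into parallel x/y lists for graphing."""
    xs, ys = [], []
    for line in fp:
        columns = line.split()
        if len(columns) < 2:
            continue
        xs.append(float(columns[0]))
        ys.append(float(columns[1]))
    return {"myschema/x": xs, "myschema/y": ys}

# dfile_extract_func={'values.dat': extract_two_columns}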
Esempio n. 27
0
def create_dataset(settings,
        source_url,
        exp_id,
        exp_name=_get_exp_name,
        dataset_name=_get_dataset_name,
        experiment_paramset=[],
        dataset_paramset=[],
        datafile_paramset=[],
        dfile_extract_func=None):

    """


        POST to mytardis_host REST API with mytardis_user and mytardis_password
        credentials to create or update experiment for a new dataset containing
        datafiles from source_url BDP directory.

        :param dict settings.keys(): ['mytardis_user', 'mytardis_password', 'mytardis_host']
        :param str source_url: chiminey URL for the source of dataset
        :param int exp_id: unique experiment id for existing experiment or 0 for new
        :param func exp_name: function that returns experiment name based on url and path
        :param func dataset_name: function that returns dataset name based on url and path
        :param paramset dataset_param: metadata package for dataset
        :param paramset datafile_paramset: metadata package for datafiles
        :param func dfile_extract_func: function that extracts datafile information
        :return: new mytardis experiment id
        :rtype: int
        :raises: IndexError if setttings does not contain required configuration fields or is otherwise invalid.

        If exp_id is non-zero, adds to existing experiment with exp_id, else new created
        identifier returned.  experiment_paramset is appended to any existing
        metadata and does not overwrite.

    """
    #FIXME,TODO: What if tardis is unavailable?  Connection to mytardis is probably
    #better handled as a separate celery subtask, which can retry until working and
    #be async

    #FIXME: missing all error checking and retrying of connection to mytardis.
    #Reliability framework should be able to supply this?

    #TODO: method should take the BDP url source_url, not the expanded one.

    logger.debug("post_dataset")
    tardis_user = settings["mytardis_user"]
    tardis_pass = settings["mytardis_password"]
    tardis_ssl = int(settings["mytardis_ssl"])
    tardis_protocol = "http://%s"
    if tardis_ssl > 0:
        tardis_protocol = "https://%s"
    tardis_host_url = tardis_protocol % settings["mytardis_host"]
    tardis_port = settings["mytardis_port"]
    logger.debug("posting dataset from %s to mytardis at %s with %s" % (source_url,
        tardis_host_url, tardis_pass))

    (source_scheme, source_location, source_path, source_location,
        query_settings) = storage.parse_bdpurl(source_url)

    logger.debug("source_path=%s" % source_path)

    if source_scheme == "file":
        root_path = _get_value('root_path', query_settings)
    else:
        logger.debug('schema=%s' % source_scheme)
        #raise InvalidInputError("only file source_schema supported for source of mytardis transfer")

    expname = exp_name(settings, source_url, source_path)
    new_exp_id = create_experiment(settings, exp_id, expname, experiment_paramset)

    new_experiment_uri = "/api/v1/experiment/%s/" % new_exp_id

    # TODO: check that we do not already have a dataset with
    # the same name and either overwrite it or skip the transfer.
    # save dataset
    logger.debug("saving dataset in experiment at %s" % new_exp_id)
    url = "%s:%s/api/v1/dataset/?format=json" % (tardis_host_url, tardis_port)
    headers = {'content-type': 'application/json'}

    schemas = dataset_paramset

    logger.debug("schemas=%s" % schemas)
    data = json.dumps({
       'experiments': [new_experiment_uri],
       'description': dataset_name(settings, source_url, source_path),
       "parameter_sets": schemas
           })
    logger.debug("data=%s" % data)
    logger.debug("post to %s" % url)
    r = requests.post(url, data=data, headers=headers, auth=HTTPBasicAuth(tardis_user, tardis_pass), verify=False)
    # FIXME: need to check for status_code and handle failures.

    logger.debug("r.json=%s" % r.json)
    logger.debug("r.text=%s" % r.text)
    logger.debug("r.headers=%s" % r.headers)
    header_location = r.headers['location']
    new_dataset_uri = header_location[len(tardis_host_url):]

    # move files across
    source_files = storage.list_all_files(source_url)
    logger.debug("source_files=%s" % source_files)
    url = "%s:%s/api/v1/dataset_file/" % (tardis_host_url, tardis_port)

    args = source_url.split('?')[1]

    logger.debug('args=%s' % args)

    staging_dir = tempfile.mkdtemp(suffix="", prefix="chiminey")
    try:
        for fname in source_files:
            logger.debug('fname=%s'
                         % os.path.join(source_location, fname))

            source_file_url = "%s://%s?%s" % (
                source_scheme, os.path.join(source_location, fname), args)
            logger.debug('source_file_url=%s' % source_file_url)

            # TODO: add retrying to this operation.
            source_file = storage.get_filep(source_file_url, sftp_reference=False)


            #logger.debug('source_file=%s' % source_file._name)

            # we have to load the contents locally at least once.
            f_contents = source_file.read()

            # Make a temporary copy as the mytardis datafile POST requires a filename
            tempfname = os.path.basename(fname)
            with open(os.path.join(staging_dir, tempfname), 'wb') as fp:
                fp.write(f_contents)

            new_datafile_paramset = []
            logger.debug("datafile_paramset=%s" % datafile_paramset)

            for paramset in datafile_paramset:
                new_paramset = {}
                logger.debug("paramset=%s" % paramset)
                new_paramset['schema'] = paramset['schema']

                has_value = False
                has_keys = False
                new_param_vals = []

                for param in paramset['parameters']:
                    new_param = {}

                    for param_key, v in param.iteritems():
                        logger.debug("param_key=%s v=%s" % (param_key,v))
                        if param_key == 'name' and v == "value_dict":
                            new_param['name'] = 'value_dict'
                            new_value = {}

                            found_func_match = False
                            for fn, func in dfile_extract_func.iteritems():
                                logger.debug("fn=%s,func=%s" % (fn, func))
                                if fn == os.path.basename(fname):
                                    # if fn file is very long, this is inefficient
                                    logger.debug("fname=%s" % os.path.join(staging_dir, fn))
                                    with open(
                                          os.path.join(staging_dir, fn),
                                         'r') as fp:
                                        new_value.update(func(fp))
                                    found_func_match = True  # FIXME: can multiple funcs match?
                                    logger.debug("matched %s %s" % (fn, func))

                            logger.debug("new_value=%s" % new_value)

                            new_param['string_value'] = json.dumps(new_value) if found_func_match else param['string_value']

                            break
                        else:
                            # in case string_value is processed first
                            new_param[param_key] = v

                    logger.debug("string_value len=%s" % new_param['string_value'])

                    if new_param['name'] == "value_dict" and len(json.loads(new_param['string_value'])):
                        has_value = True
                    logger.debug("has_value=%s" % has_value)

                    if new_param['name'] == "value_keys" and len(json.loads(new_param['string_value'])):
                        has_keys = True
                    logger.debug("has_keys=%s" % has_keys)

                    new_param_vals.append(new_param)

                new_paramset['parameters'] = new_param_vals

                logger.debug("has_value=%s" % has_value)
                logger.debug("has_keys=%s" % has_keys)

                if has_value or has_keys:
                    new_datafile_paramset.append(new_paramset)
                else:
                    logger.debug("not adding %s" % new_paramset)

            logger.debug("new_datafile_paramset=%s" % new_datafile_paramset)
            file_size = len(f_contents)
            logger.debug("file_size=%s" % file_size)
            if file_size:

                data = json.dumps({
                    u'dataset': str(new_dataset_uri),
                    u'parameter_sets': new_datafile_paramset,
                    u'filename': os.path.basename(fname),
                    u'size': file_size,
                    u'mimetype': 'text/plain',
                    u'md5sum': hashlib.md5(f_contents).hexdigest()
                    })
                logger.debug("data=%s" % data)

                with open(os.path.join(staging_dir, tempfname), 'rb') as fp:

                    r = requests.post(url,
                        data={"json_data": data},
                        headers={'Accept': 'application/json'},
                        files={'attached_file': fp},
                        auth=HTTPBasicAuth(tardis_user, tardis_pass),
                        verify=False
                        )

                    # FIXME: need to check for status_code and handle failures.
                    logger.debug("r.js=%s" % r.json)
                    logger.debug("r.te=%s" % r.text)
                    logger.debug("r.he=%s" % r.headers)

            else:
                logger.warn("not transferring empty file %s" % fname)
                #TODO: check whether mytardis api can accept zero length files

    finally:
        shutil.rmtree(staging_dir)

    return new_exp_id
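
A minimal call sketch for the create_dataset function above, mirroring the callers earlier in this listing; all_settings, source_dir_url and experiment_id are assumed to have been prepared as in those callers, and the paramset shown is illustrative only:

experiment_id = mytardis.create_dataset(
    settings=all_settings,            # mytardis_user/password/host/port/ssl
    source_url=source_dir_url,        # BDP URL (with credentials) for the output directory
    exp_id=experiment_id,             # 0 creates a new experiment
    exp_name=mytardis.get_exp_name_for_output,
    dataset_name=mytardis.get_dataset_name_for_output,
    dataset_paramset=[mytardis.create_paramset('hrmcdataset/output', [])],
    datafile_paramset=[],
    dfile_extract_func={})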
Esempio n. 28
0
    def build_metadata_for_final_output(self, m, output_dir, **kwargs):
        #FIXME: this calculation should be done as in extract_psd_func
        # pulling directly from data_errors rather than passing in
        # through nested function.
        experiment_paramset = []
        dataset_paramset = []
        datafile_paramset = []
        dfile_extract_func = {}

        exp_value_keys = []
        legends = []
        # use a separate loop index so the m parameter (this node's index) is
        # not clobbered while building the per-node experiment value keys
        for j, current_dir in enumerate(kwargs['output_dirs']):
            #node_path = os.path.join(iter_output_dir, node_dir)

            exp_value_keys.append(
                ["hrmcdset%s/step" % j,
                 "hrmcdset%s/err" % j])

            source_url = storage.get_url_with_credentials(
                kwargs['storage_settings'], current_dir, is_relative_path=False)

            (source_scheme, source_location, source_path, source_location,
             query_settings) = storage.parse_bdpurl(source_url)
            logger.debug("source_url=%s" % source_url)
            legends.append(
                mytardis.get_dataset_name_for_output(
                    kwargs['storage_settings'], "", source_path))

        logger.debug("exp_value_keys=%s" % exp_value_keys)
        logger.debug("legends=%s" % legends)

        # for m, output_dir in enumerate(kwargs['output_dirs']):
        #node_path = os.path.join(iter_output_dir, output_dir)
        node_path = output_dir
        logger.debug("node_path=%s" % node_path)

        dataerrors_url = storage.get_url_with_credentials(
            kwargs['storage_settings'],
            os.path.join(node_path, self.DATA_ERRORS_FILE),
            is_relative_path=False)
        logger.debug("dataerrors_url=%s" % dataerrors_url)
        dataerrors_content = storage.get_file(dataerrors_url)
        xs = []
        ys = []
        re_dbl_fort = re.compile(r'(\d*\.\d+)[dD]([-+]?\d+)')
        for i, line in enumerate(dataerrors_content.splitlines()):
            if i == 0:
                continue
            columns = line.split()
            try:
                hrmc_step = int(columns[self.STEP_COLUMN_NUM])
            except ValueError:
                logger.warn("could not parse hrmc_step value on line %s" % i)
                continue
            # handle Fortran double-precision float format (e.g. 0.123D+02)
            val = columns[self.ERRGR_COLUMN_NUM]
            val = re_dbl_fort.sub(r'\1E\2', val)
            logger.debug("val=%s" % val)
            try:
                hrmc_errgr = float(val)
            except ValueError:
                logger.warn("could not parse hrmc_errgr value on line %s" % i)
                continue
            xs.append(hrmc_step)
            ys.append(hrmc_errgr)

        logger.debug("xs=%s" % xs)
        logger.debug("ys=%s" % ys)

        crit_url = storage.get_url_with_credentials(kwargs['storage_settings'],
                                                    os.path.join(
                                                        node_path,
                                                        "criterion.txt"),
                                                    is_relative_path=False)
        try:
            crit = storage.get_file(crit_url)
        except (ValueError, IOError):
            crit = None
        # FIXME: can crit be zero?
        logger.debug("crit=%s" % crit)
        if crit:
            system_id = int(getval(kwargs['run_settings'],
                                   '%s/system/id' % django_settings.SCHEMA_PREFIX))
            hrmcdset_val = {"hrmcdset/it": system_id, "hrmcdset/crit": crit}
        else:
            hrmcdset_val = {}

        # TODO: move into a utility function for reuse
        def extract_psd_func(fp):
            res = []
            xs = []
            ys = []
            for i, line in enumerate(dataerrors_content.splitlines()):
                if i == 0:
                    continue
                columns = line.split()

                val = columns[self.STEP_COLUMN_NUM]
                val = re_dbl_fort.sub(r'\1E\2', val)
                logger.debug("val=%s" % val)
                try:
                    x = float(val)
                except ValueError:
                    logger.warn("could not parse value on line %s" % i)
                    continue

                val = columns[self.ERRGR_COLUMN_NUM]
                val = re_dbl_fort.sub(r'\1E\2', val)
                logger.debug("val=%s" % val)
                try:
                    y = float(val)
                except ValueError:
                    logger.warn("could not parse value on line %s" % i)
                    continue

                xs.append(x)
                ys.append(y)
            res = {"hrmcdfile/r1": xs, "hrmcdfile/g1": ys}
            return res

        def extract_psdexp_func(fp):
            res = []
            xs = []
            ys = []
            for i, line in enumerate(fp):
                columns = line.split()
                xs.append(float(columns[0]))
                ys.append(float(columns[1]))
            res = {"hrmcdfile/r2": xs, "hrmcdfile/g2": ys}
            return res

        def extract_grfinal_func(fp):
            res = []
            xs = []
            ys = []
            for i, line in enumerate(fp):
                columns = line.split()
                xs.append(float(columns[0]))
                ys.append(float(columns[1]))
            #FIXME: len(xs) == len(ys) for this to work.
            #TODO: hack to handle when xs and ys are too
            # large to fit in Parameter with db_index.
            # solved by function call at destination
            cut_xs = [
                xs[i] for i, x in enumerate(xs) if (i % (len(xs) / 50) == 0)
            ]
            cut_ys = [
                ys[i] for i, x in enumerate(ys) if (i % (len(ys) / 50) == 0)
            ]

            res = {"hrmcdfile/r3": cut_xs, "hrmcdfile/g3": cut_ys}
            return res

        def extract_inputgr_func(fp):
            res = []
            xs = []
            ys = []
            for i, line in enumerate(fp):
                columns = line.split()
                xs.append(float(columns[0]))
                ys.append(float(columns[1]))
            #FIXME: len(xs) == len(ys) for this to work.
            #TODO: hack to handle when xs and ys are too
            # large to fit in Parameter with db_index.
            # solved by function call at destination
            cut_xs = [
                xs[i] for i, x in enumerate(xs) if (i % (len(xs) / 50) == 0)
            ]
            cut_ys = [
                ys[i] for i, x in enumerate(ys) if (i % (len(ys) / 50) == 0)
            ]

            res = {"hrmcdfile/r4": cut_xs, "hrmcdfile/g4": cut_ys}
            return res

        #todo: replace self.boto_setttings with mytardis_settings

        # Only save graph paramset for experiment once per experiment.
        if not self.final_graph_paramset:
            self.final_graph_paramset = [
                mytardis.create_graph_paramset("expgraph",
                                               name="hrmcexp2",
                                               graph_info={
                                                   "axes":
                                                   ["step", "ERRGr*wf"],
                                                   "precision": [0, 2],
                                                   "legends": legends
                                               },
                                               value_dict={},
                                               value_keys=exp_value_keys)
            ]

            experiment_paramset = self.final_graph_paramset
        else:
            experiment_paramset = []

        dataset_paramset = [
            mytardis.create_paramset('hrmcdataset/output', []),
            mytardis.create_graph_paramset(
                'dsetgraph',
                name="hrmcdset",
                graph_info={
                    "axes": ["r (Angstroms)", "PSD"],
                    "legends": ["psd", "PSD_exp"],
                    "type": "line"
                },
                value_dict=hrmcdset_val,
                value_keys=[["hrmcdfile/r1", "hrmcdfile/g1"],
                            ["hrmcdfile/r2", "hrmcdfile/g2"]]),
            mytardis.create_graph_paramset(
                'dsetgraph',
                name='hrmcdset2',
                graph_info={
                    "axes": ["r (Angstroms)", "g(r)"],
                    "legends": ["data_grfinal", "input_gr"],
                    "type": "line"
                },
                value_dict={},
                value_keys=[["hrmcdfile/r3", "hrmcdfile/g3"],
                            ["hrmcdfile/r4", "hrmcdfile/g4"]]),
            mytardis.create_graph_paramset('dsetgraph',
                                           name='hrmcdset%s' % m,
                                           graph_info={},
                                           value_dict={
                                               "hrmcdset%s/step" % m: xs,
                                               "hrmcdset%s/err" % m: ys
                                           },
                                           value_keys=[]),
        ]
        datafile_paramset = [
            mytardis.create_graph_paramset('dfilegraph',
                                           name="hrmcdfile",
                                           graph_info={},
                                           value_dict={},
                                           value_keys=[])
        ]
        dfile_extract_func = {
            'psd.dat': extract_psd_func,
            'PSD_exp.dat': extract_psdexp_func,
            'data_grfinal.dat': extract_grfinal_func,
            'input_gr.dat': extract_inputgr_func
        }
        logger.debug("experiment_paramset=%s" % experiment_paramset)
        logger.debug("dataset_paramset=%s" % dataset_paramset)
        logger.debug("datafile_paramset=%s" % datafile_paramset)
        logger.debug("dfile_extract_func=%s" % dfile_extract_func)

        return (experiment_paramset, dataset_paramset, datafile_paramset,
                dfile_extract_func)
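
The ERRGr values parsed in build_metadata_for_final_output are written in Fortran double-precision notation (for example 0.123D+02), which float() cannot parse directly, so the method rewrites the D exponent to E first. A standalone sketch of that conversion using the same regular expression; fort_to_float is a hypothetical helper name:

import re

re_dbl_fort = re.compile(r'(\d*\.\d+)[dD]([-+]?\d+)')

def fort_to_float(token):
    """Convert a Fortran-style double such as '0.123D+02' to a Python float."""
    return float(re_dbl_fort.sub(r'\1E\2', token))

# fort_to_float('0.123D+02') == 12.3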
Esempio n. 29
0
    def create_dataset_for_final_output(self, run_settings, experiment_id,
                                        base_dir, output_url, all_settings):
        logger.debug("curate_dataset")
        iter_output_dir = os.path.join(os.path.join(base_dir, "output"))
        logger.debug("iter_output_dir=%s" % iter_output_dir)

        output_prefix = '%s://%s@' % (all_settings['scheme'],
                                      all_settings['type'])
        iter_output_dir = "%s%s" % (output_prefix, iter_output_dir)
        logger.debug("iter_output_dir=%s" % iter_output_dir)
        logger.debug("output_url=%s" % output_url)
        (scheme, host, mypath, location,
         query_settings) = storage.parse_bdpurl(output_url)
        fsys = storage.get_filesystem(output_url)

        node_output_dirnames, _ = fsys.listdir(mypath)
        logger.debug("node_output_dirnames=%s" % node_output_dirnames)

        curate_data = (getval(
            run_settings,
            '%s/input/mytardis/curate_data' % self.SCHEMA_PREFIX))
        if curate_data:
            if all_settings['mytardis_host']:
                output_dirs = []
                for m, dir_name in enumerate(node_output_dirnames):
                    output_dirs.append(os.path.join(iter_output_dir, dir_name))

                for m, output_dir in enumerate(output_dirs):
                    #node_path = os.path.join(iter_output_dir, node_dir)
                    logger.debug("output_dir=%s" % output_dir)

                    experiment_paramset = []
                    dataset_paramset = []
                    datafile_paramset = []
                    dfile_extract_func = {}
                    self.load_metadata_builder(run_settings)
                    if self.METADATA_BUILDER:
                        (experiment_paramset, dataset_paramset, datafile_paramset,
                         dfile_extract_func) = \
                            self.METADATA_BUILDER.build_metadata_for_final_output(
                                m, output_dir, run_settings=run_settings,
                                storage_settings=all_settings,
                                output_dirs=output_dirs)

                    source_url = get_url_with_credentials(
                        all_settings, output_dir, is_relative_path=False)
                    logger.debug("source_url=%s" % source_url)

                    experiment_id = mytardis.create_dataset(
                        settings=all_settings,
                        source_url=source_url,
                        exp_name=mytardis.get_exp_name_for_output,
                        dataset_name=mytardis.get_dataset_name_for_output,
                        exp_id=experiment_id,
                        experiment_paramset=experiment_paramset,
                        dataset_paramset=dataset_paramset,
                        datafile_paramset=datafile_paramset,
                        dfile_extract_func=dfile_extract_func)
                    graph_paramset = []
            else:
                logger.warn("no mytardis host specified")
        else:
            logger.warn('Data curation is off')
        return experiment_id
Esempio n. 30
0
    def process_outputs(self, run_settings, base_dir, input_url, all_settings):

        id = int(getval(run_settings, '%s/system/id' % RMIT_SCHEMA))
        iter_output_dir = os.path.join(os.path.join(base_dir, "input_%s" % (id + 1)))
        output_prefix = '%s://%s@' % (all_settings['scheme'],
                                    all_settings['type'])
        iter_output_dir = "%s%s" % (output_prefix, iter_output_dir)

        (scheme, host, iter_output_path, location, query_settings) = storage.parse_bdpurl(input_url)
        iter_out_fsys = storage.get_filesystem(input_url)

        input_dirs, _ = iter_out_fsys.listdir(iter_output_path)

        # TODO: store all audit info in single file in input_X directory in transform,
        # so we do not have to load individual files within node directories here.
        min_crit = sys.float_info.max - 1.0
        min_crit_index = sys.maxint

        logger.debug("input_dirs=%s" % input_dirs)
        for input_dir in input_dirs:
            node_path = os.path.join(iter_output_dir, input_dir)
            logger.debug('node_path= %s' % node_path)

            # Retrieve audit file

            # audit_url = get_url_with_credentials(output_storage_settings,
            #     output_prefix + os.path.join(self.iter_inputdir, input_dir, 'audit.txt'), is_relative_path=False)
            audit_url = get_url_with_credentials(all_settings, os.path.join(node_path, "audit.txt"), is_relative_path=False)
            audit_content = storage.get_file(audit_url)
            logger.debug('audit_url=%s' % audit_url)

            # extract the best criterion error
            # FIXME: audit.txt is potentially debug file so format may not be fixed.
            p = re.compile("Run (\d+) preserved \(error[ \t]*([0-9\.]+)\)", re.MULTILINE)
            m = p.search(audit_content)
            criterion = None
            if m:
                criterion = float(m.group(2))
                best_numb = int(m.group(1))
                # NB: assumes that subdirs in the new input_x have the same names as the output dirs that created them.
                best_node = input_dir
            else:
                message = "Cannot extract criterion from audit file for iteration %s" % (self.id + 1)
                logger.warn(message)
                raise IOError(message)

            if criterion < min_crit:
                min_crit = criterion
                min_crit_index = best_numb
                min_crit_node = best_node

        logger.debug("min_crit = %s at %s" % (min_crit, min_crit_index))

        if min_crit_index >= sys.maxint:
            raise BadInputException("Unable to find minimum criterion of input files")

        # get previous best criterion
        try:
            self.prev_criterion = float(getval(run_settings, '%s/converge/criterion' % RMIT_SCHEMA))
        except (SettingNotFoundException, ValueError):
            self.prev_criterion = sys.float_info.max - 1.0
            logger.warn("no previous criterion found")

        # check whether we are under the error threshold
        logger.debug("best_num=%s" % best_numb)
        logger.debug("prev_criterion = %f" % self.prev_criterion)
        logger.debug("min_crit = %f" % min_crit)
        logger.debug('Current min criterion: %f, Prev '
                     'criterion: %f' % (min_crit, self.prev_criterion))
        difference = self.prev_criterion - min_crit
        logger.debug("Difference %f" % difference)

        try:
            max_iteration = int(getval(run_settings, '%s/input/hrmc/max_iteration' % RMIT_SCHEMA))
        except (ValueError, SettingNotFoundException):
            raise BadInputException("unknown max_iteration")
        logger.debug("max_iteration=%s" % max_iteration)

        try:
            self.error_threshold = float(getval(run_settings, '%s/input/hrmc/error_threshold' % RMIT_SCHEMA))
        except (SettingNotFoundException, ValueError):
            raise BadInputException("uknown error threshold")
        logger.debug("error_threshold=%s" % self.error_threshold)

        if self.id >= (max_iteration - 1):
            logger.debug("Max Iteration Reached %d " % self.id)
            return (True, min_crit)

        elif min_crit <= self.prev_criterion and difference <= self.error_threshold:
            logger.debug("Convergence reached %f" % difference)
            return (True, min_crit)

        else:
            if difference < 0:
                logger.debug("iteration diverged")
            logger.debug("iteration continues: %d iteration so far" % self.id)

        return (False, min_crit)
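
process_outputs above pulls the best criterion out of each audit.txt with a single regular expression and raises if no match is found. A minimal standalone sketch of that parse using the same pattern; parse_audit is a hypothetical helper and the sample line is illustrative:

import re

audit_re = re.compile(r"Run (\d+) preserved \(error[ \t]*([0-9.]+)\)", re.MULTILINE)

def parse_audit(audit_content):
    """Return (run_number, criterion) from an audit.txt blob, or None if absent."""
    m = audit_re.search(audit_content)
    if not m:
        return None
    return int(m.group(1)), float(m.group(2))

# parse_audit("Run 3 preserved (error 0.0125)") == (3, 0.0125)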
Esempio n. 32
0
def create_dataset(settings,
        source_url,
        exp_id,
        exp_name=_get_exp_name,
        dataset_name=_get_dataset_name,
        experiment_paramset=[],
        dataset_paramset=[],
        datafile_paramset=[],
        dfile_extract_func=None):
    """

    Notes:
        POST to mytardis_host REST API with mytardis_user and mytardis_password
        credentials to create or update experiment for a new dataset containing
        datafiles from source_url BDP directory.

    Args:
        settings:

        source_url: url containing data to be ingested
        exp_id:
        [exp_name,dataset_name]:  functions that return new
    experiment and dataset names respectively based on url and path
        experiment_paramset: ...
        dataset_paramset: ...
        datafile_paramset:
        dfile_extract_func:


    FIXME,TODO: What if tardis in unavailable?  Connection to mytardis probably
    better handled as sperate celery subtask, which can retry until working and
    be async

    FIXME: missing all error checking and retrying of connection to mytardis.
    Reliability framework should be able to supply this?
    """

    #TODO: method should take BDP url source_url not, expanded one.

    logger.debug("post_dataset")
    tardis_user = settings["mytardis_user"]
    tardis_pass = settings["mytardis_password"]
    tardis_host_url = "http://%s" % settings["mytardis_host"]
    logger.debug("posting dataset from %s to mytardis at %s with %s" % (source_url,
        tardis_host_url, tardis_pass))

    (source_scheme, source_location, source_path, source_location,
        query_settings) = storage.parse_bdpurl(source_url)

    logger.debug("source_path=%s" % source_path)

    if source_scheme == "file":
        root_path = _get_value('root_path', query_settings)
    else:
        logger.debug('schema=%s' % source_scheme)
        #raise InvalidInputError("only file source_schema supported for source of mytardis transfer")

    expname = exp_name(settings, source_url, source_path)
    new_exp_id = create_experiment(settings, exp_id, expname, experiment_paramset)

    new_experiment_uri = "/api/v1/experiment/%s/" % new_exp_id

    # TODO: check that we do not already have a dataset with
    # the same name and either overwrite it or skip the transfer.
    # save dataset
    logger.debug("saving dataset in experiment at %s" % new_exp_id)
    url = "%s/api/v1/dataset/?format=json" % tardis_host_url
    headers = {'content-type': 'application/json'}

    # # FIXME: schema should be a parameter
    # schemas = [{
    #            "schema": "http://rmit.edu.au/schemas/hrmcdataset",
    #            "parameters": []
    #           }]
    # if dataset_schema:
    #    schemas.append({
    #        "schema": dataset_schema,
    #        "parameters": []
    #        })

    schemas = dataset_paramset

    logger.debug("schemas=%s" % schemas)
    data = json.dumps({
       'experiments': [new_experiment_uri],
       'description': dataset_name(settings, source_url, source_path),
       "parameter_sets": schemas
           })
    logger.debug("data=%s" % data)
    logger.debug("post to %s" % url)
    r = requests.post(url, data=data, headers=headers, auth=HTTPBasicAuth(tardis_user, tardis_pass))
    # FIXME: need to check for status_code and handle failures.

    logger.debug("r.json=%s" % r.json)
    logger.debug("r.text=%s" % r.text)
    logger.debug("r.headers=%s" % r.headers)
    header_location = r.headers['location']
    new_dataset_uri = header_location[len(tardis_host_url):]

    # move files across
    source_files = storage.list_all_files(source_url)
    logger.debug("source_files=%s" % source_files)
    url = "%s/api/v1/dataset_file/" % tardis_host_url
    headers = {'Accept': 'application/json'}

    args = source_url.split('?')[1]

    logger.debug('args=%s' % args)
    '''
    psd_url = smartconnectorscheduler.get_url_with_credentials(output_storage_credentials,
                        'ssh://unix@' + os.path.join(self.output_dir,
                            node_output_dir, "PSD_output", "psd.dat"), is_relative_path=False)
        logger.debug('psd_url=%s' % psd_url)

        psd = hrmcstages.storage.get_filep(psd_url)
    '''
    for file_location in source_files:
        logger.debug('file_location=%s' % os.path.join(source_location, file_location))
        source_file_url = "%s://%s?%s" % (source_scheme, os.path.join(source_location, file_location), args)
        logger.debug('source_file_url=%s' % source_file_url)
        source_file, source_file_ref = storage.get_filep(source_file_url, sftp_reference=True)
        logger.debug('source_file=%s' % source_file._name)
        #file_path = os.path.join(root_path, file_location)
        #file_path = os.path.join(source_url, file_location)
        #logger.debug("file_path=%s" % file_path)
        #logger.debug("content=%s" % open(file_path,'rb').read())

        new_datafile_paramset = []
        logger.debug("datafile_paramset=%s" % datafile_paramset)
        for paramset in datafile_paramset:
            new_paramset = {}
            logger.debug("paramset=%s" % paramset)
            new_paramset['schema'] = paramset['schema']

            has_value = False
            has_keys = False
            new_param_vals = []
            for param in paramset['parameters']:
                new_param = {}
                for param_key, v in param.items():

                    if param_key == 'name' and v == "value_dict":
                        new_param['name'] = 'value_dict'
                        new_value = {}

                        #val = param['string_value']

                        # if not isinstance(val, basestring):
                        #     dfile_extract_func = val

                        found_func_match = False
                        for fname, func in dfile_extract_func.items():
                            logger.debug("fname=%s,func=%s" % (fname, func))
                            if fname == os.path.basename(file_location):
                                #new_value.update(func(open(file_path, 'r')))
                                source_file.seek(0)
                                new_value.update(func(source_file))

                                found_func_match = True  # FIXME: can multiple funcs match?

                        logger.debug("new_value=%s" % new_value)

                        if found_func_match:
                            new_param['string_value'] = json.dumps(new_value)
                        else:
                            new_param['string_value'] = param['string_value']
                        break
                    else:
                        # in case string_value is processed first
                        new_param[param_key] = v

                if new_param['name'] == "value_dict" and len(json.loads(new_param['string_value'])):
                    has_value = True
                if new_param['name'] == "value_keys" and len(json.loads(new_param['string_value'])):
                    has_keys = True
                new_param_vals.append(new_param)

            new_paramset['parameters'] = new_param_vals

            logger.debug("has_value=%s" % has_value)
            logger.debug("has_keys=%s" % has_keys)

            if has_value or has_keys:
                new_datafile_paramset.append(new_paramset)
            else:
                logger.debug("not adding %s" % new_paramset)

        logger.debug("new_datafile_paramset=%s" % new_datafile_paramset)
        logger.debug("file_namee=%s" % source_file._name)
        file_size = source_file_ref.size(source_file._name)
        logger.debug("file_size=%s" % file_size)
        if file_size > 0:
            source_file.seek(0)
            data = json.dumps({
                'dataset': str(new_dataset_uri),
                "parameter_sets": new_datafile_paramset,
                'filename': os.path.basename(file_location),
                #'filename': os.path.basename(file_path),
                'size': file_size,
                'mimetype': 'text/plain',
                'md5sum': hashlib.md5(source_file.read()).hexdigest()
                #'md5sum': hashlib.md5(open(file_path, 'r').read()).hexdigest()
                })
            logger.debug("data=%s" % data)
            #import pdb; pdb.set_trace()
            source_file.seek(0)
            #logger.debug(source_file.read())
            source_file.seek(0)
            r = requests.post(url, data={'json_data': data}, headers=headers,
                files={'attached_file': source_file},  # open(file_path, 'rb')},
                auth=HTTPBasicAuth(tardis_user, tardis_pass)
                )

            # FIXME: need to check for status_code and handle failures.

            logger.debug("r.js=%s" % r.json)
            logger.debug("r.te=%s" % r.text)
            logger.debug("r.he=%s" % r.headers)
        else:
            logger.warn("not transferring empty file %s" % file_location)
            #TODO: check whether mytardis api can accept zero length files

    return new_exp_id
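
Several of the REST calls in this listing carry a FIXME about unchecked responses. A minimal sketch of the kind of check that could follow each requests.post, assuming the MyTardis tastypie API answers 200 or 201 on success; _check_response is a hypothetical helper, not part of the original module:

def _check_response(r, what):
    """Log and raise if a MyTardis API POST did not succeed."""
    if r.status_code not in (200, 201):
        logger.error("%s failed: status=%s body=%s" % (what, r.status_code, r.text))
        raise IOError("MyTardis request failed with status %s" % r.status_code)
    return r

# r = _check_response(requests.post(url, data=data, headers=headers,
#                                   auth=HTTPBasicAuth(tardis_user, tardis_pass)),
#                     "dataset creation")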