Beispiel #1
0
def search_cmd(command, output_format=None):
    '''
    Runs a similarity search against a chemical space.

    command is a dictionary that must provide the keys 'space' and
    'version'; 'label' is optional. The whole dictionary is handed over
    to Search.run, so any other key it contains is consumed there.

    Returns a (success, results) tuple: either the pair produced by
    Search.run or (False, <error message>) when the space directory
    does not exist.
    '''
    from flame.search import Search

    # ** DEPRECATE **
    # older API versions do not send a label; fall back to 'temp'.
    # Note that the default is written into the caller's dictionary
    command.setdefault('label', 'temp')

    # refuse to continue when the space has no directory in the repository
    space_name = command['space']
    if not os.path.isdir(utils.space_path(space_name, 0)):
        return False, 'Endpoint name not found in space repository.'

    searcher = Search(
        space_name,
        version=command['version'],
        output_format=output_format,
        label=command['label'],
    )

    if utils.isSingleThread():
        searcher.set_single_CPU()

    outcome, payload = searcher.run(command)

    LOG.info('Search completed...')

    return outcome, payload
Beispiel #2
0
def sbuild_cmd(arguments, output_format=None):
    '''
    Instantiates a Sbuild object to build a chemical space using the given
    input file and space.

    arguments is a dictionary with the keys:
        'space'         name of the space to build
        'infile'        path to the compound database, or None to reuse
                        the 'training_series' file already stored in the
                        space directory
        'param_string'  (optional) parameters passed to Sbuild
        'param_file'    (optional) parameter file passed to Sbuild; only
                        used when 'param_string' is not present

    Returns a (success, results) tuple: either the pair produced by
    Sbuild.run or (False, <error message>) when a pre-condition fails.
    '''

    from flame.sbuild import Sbuild

    # safety check if space exists
    space_dir = utils.space_path(arguments['space'], 0)
    if not os.path.isdir(space_dir):
        return False, 'Endpoint name not found in space repository.'

    # remove pre-existing results and metadata files
    for stale_name in ('space-results.pkl', 'space-meta.pkl'):
        stale_file = os.path.join(space_dir, stale_name)
        if os.path.isfile(stale_file):
            os.remove(stale_file)

    if 'param_string' in arguments:
        sbuild = Sbuild(arguments['space'],
                        param_string=arguments['param_string'],
                        output_format=output_format)
    elif 'param_file' in arguments:
        sbuild = Sbuild(arguments['space'],
                        param_file=arguments['param_file'],
                        output_format=output_format)
    else:
        sbuild = Sbuild(arguments['space'], output_format=output_format)

    if utils.isSingleThread():
        sbuild.set_single_CPU()

    ifile = arguments['infile']

    # lfile is the local copy of the compound database kept in the space
    # directory
    lfile = os.path.join(space_dir, 'training_series')

    # when a new training series is provided in the command line
    # try to copy it to the space directory
    if ifile is not None:
        if not os.path.isfile(ifile):
            return False, f'Wrong compound database file {ifile}'
        try:
            safe_copy(ifile, lfile)
        except Exception:
            # 'except Exception' instead of a bare 'except' so that
            # KeyboardInterrupt / SystemExit are not reported as a
            # copy failure
            return False, 'Unable to copy input file to space directory'

    # check that the local copy of the input file exists
    if not os.path.isfile(lfile):
        return False, 'No compound database found'

    # run the space building with the input file
    success, results = sbuild.run(lfile)

    return success, results
Beispiel #3
0
def get_ensemble_input(task, model_names, model_versions, infile):
    '''
    Runs predict_cmd once for every (model name, model version) pair and
    collects the outcomes, to be used as input of an ensemble model.

    The predictions are dispatched through joblib when the single-thread
    mode is off and there are more than MAX_MODELS_SINGLE_CPU models;
    otherwise they are computed one after the other. The task argument
    is not used at present.

    Returns (True, <list with the results of every prediction>) when all
    predictions succeeded and (False, <error message>) otherwise.
    '''

    n_sources = len(model_names)

    # with many external sources it is more convenient to parallelize the
    # models than to run the internal task in parallel.
    # The explicit comparison with False is kept on purpose
    run_parallel = False
    if utils.isSingleThread() == False:
        run_parallel = n_sources > MAX_MODELS_SINGLE_CPU

    # one prediction command per inner model, all of them reading the
    # same input file
    commands = [
        {'endpoint': model_names[idx],
         'version': model_versions[idx],
         'infile': infile,
         'output_format': 'ghost',
         'label': f'ensemble{idx}'}
        for idx in range(n_sources)
    ]

    if run_parallel:
        LOG.info(f'Runing {n_sources} threads in parallel')

        from joblib import Parallel, delayed
        batch = Parallel(n_jobs=n_sources)(delayed(predict_cmd)(cmd) for cmd in commands)

        statuses = [item[0] for item in batch]   # True / False
        payloads = [item[1] for item in batch]   # result of every prediction

    else:
        # sequential execution
        statuses = []
        payloads = []
        for cmd in commands:
            ok, payload = predict_cmd(cmd)
            statuses.append(ok)
            payloads.append(payload)

    if False in statuses:
        return False, 'Some external input sources failed: '+str(statuses)

    LOG.info('External input computed')

    return True, payloads
Beispiel #4
0
def _merge_training_series(existing_file, new_file, merged_file):
    '''
    Writes into merged_file the lines of existing_file followed by the
    lines of new_file.

    Handling the extra newline of SDFiles is problematic, so trailing
    whitespace is stripped from every line and the newline is emitted
    *before* each line except the very first one written. The merged
    file therefore never starts with an empty line (even when
    existing_file is empty) and never ends with a newline.
    '''
    # the sources are decoded as UTF-8, so the output is written as UTF-8
    # too instead of relying on the platform default encoding
    with open(merged_file, 'w', encoding='utf-8') as outfile:
        first = True
        for source in (existing_file, new_file):
            with codecs.open(source, 'r', encoding='utf-8', errors='ignore') as infile:
                for line in infile:
                    if first:
                        outfile.write(line.rstrip())
                        first = False
                    else:
                        outfile.write(f'\n{line.rstrip()}')


def build_cmd(arguments, output_format=None):
    '''
    Instantiates a Build object to build a model using the given
    input file and model.

    This method must be self-contained and suitable for being called in
    cascade, by models which use the output of other models as input

    arguments is a dictionary with the keys:
        'endpoint'      name of the model to build
        'infile'        path to the training series, or None to reuse the
                        'training_series' file already stored in the
                        endpoint directory
        'param_file'    (optional) parameter file passed to Build
        'param_string'  (optional) parameters passed to Build; only used
                        when 'param_file' is not present
        'incremental'   (optional, default False) when true and the
                        endpoint already contains a training series, the
                        input file is appended to it instead of replacing
                        it. Only read for non-ensemble models

    Returns a (success, results) tuple: either the pair produced by
    Build.run or (False, <error message>) when a pre-condition fails.
    '''

    from flame.build import Build

    # safety check if model exists
    endpoint_dir = utils.model_path(arguments['endpoint'], 0)
    if not os.path.isdir(endpoint_dir):
        return False, 'Endpoint name not found in model repository.'

    # remove pre-existing results and metadata files
    for stale_name in ('model-results.pkl', 'model-meta.pkl'):
        stale_file = os.path.join(endpoint_dir, stale_name)
        if os.path.isfile(stale_file):
            os.remove(stale_file)

    # input file provided in the command
    ifile = arguments['infile']
    if ifile is not None and not os.path.isfile(ifile):
        return False, f'Wrong training series file {ifile}'

    # lfile is the "training_series" copied internally to the endpoint folder
    lfile = os.path.join(endpoint_dir, 'training_series')

    if 'param_file' in arguments:
        build = Build(arguments['endpoint'], param_file=arguments['param_file'], output_format=output_format)
    elif 'param_string' in arguments:
        build = Build(arguments['endpoint'], param_string=arguments['param_string'], output_format=output_format)
    else:
        build = Build(arguments['endpoint'], output_format=output_format)

    if utils.isSingleThread():
        build.set_single_CPU()

    ensemble = build.get_ensemble()

    # ensemble[0]     Boolean with True for ensemble models and False otherwise
    # ensemble[1]     List of ensemble model names
    # ensemble[2]     List of ensemble model versions

    if ensemble[0]:

        emodels = ensemble[1]
        evers = ensemble[2]

        if ifile is None:
            if not os.path.isfile(lfile):
                return False, 'no training series detected'
        else:
            try:
                safe_copy(ifile, lfile)
            except Exception:
                # 'except Exception' instead of a bare 'except' so that
                # KeyboardInterrupt / SystemExit are not reported as a
                # copy failure
                return False, 'Unable to copy input file to model directory'

        success, model_res = get_ensemble_input(build, emodels, evers, lfile)

        if not success:
            return False, model_res

        # append the ID of every inner model to the ID of this model
        for emodel, ever in zip(emodels, evers):
            success, iID = utils.getModelID(emodel, ever, 'model')
            if success:
                build.extend_modelID(iID)

        LOG.debug(f'New modelID is: {build.conveyor.getMeta("modelID")}')

        # now run the model using the data from the external sources
        success, results = build.run(model_res)

    else:

        # when a new training series is provided in the command line
        # try to copy it to the model directory
        if ifile is not None:

            # in case of incremental training, add the input file at the
            # end of the existing file. When there is nothing to extend
            # yet, the input file simply becomes the training series
            # (previously it was silently discarded in this situation)
            if arguments.get('incremental', False) and os.path.isfile(lfile):
                LOG.info(f'Merging file {ifile} with existing training series')
                new_training = os.path.join(endpoint_dir, 'temp_training')
                _merge_training_series(lfile, ifile, new_training)
                shutil.move(new_training, lfile)
            else:
                try:
                    safe_copy(ifile, lfile)
                except Exception:
                    return False, 'Unable to copy input file to model directory'

        # check that the local copy of the input file exists
        if not os.path.isfile(lfile):
            return False, 'No training series found'

        # run the model with the input file
        success, results = build.run(lfile)

    return success, results
Beispiel #5
0
def predict_cmd(arguments, output_format=None):
    '''
    Runs a prediction with the model described in arguments.

    arguments is a dictionary that must provide the keys 'endpoint',
    'version' and 'infile'. The keys 'label' and 'output_format' are
    optional; when 'output_format' is present it takes precedence over
    the output_format parameter.

    This function is self-contained so that it can be called in cascade
    by models which use the output of other models as input.

    Returns a (success, results) tuple: either the pair produced by
    Predict.run or (False, <error message>) when a pre-condition fails.
    '''
    from flame.predict import Predict

    # refuse to continue when the model has no directory in the repository
    model_dir = utils.model_path(arguments['endpoint'], 0)
    if not os.path.isdir(model_dir):
        return False, 'Endpoint name not found in model repository.'

    # ** DEPRECATE **
    # older API versions do not send a label; fall back to 'temp'.
    # Note that the default is written into the caller's dictionary
    arguments.setdefault('label', 'temp')

    # a format given inside the arguments overrides the parameter
    output_format = arguments.get('output_format', output_format)

    predict = Predict(
        arguments['endpoint'],
        version=arguments['version'],
        output_format=output_format,
        label=arguments['label'],
    )

    if utils.isSingleThread():
        predict.set_single_CPU()

    # ensemble[0]     True for ensemble models, False otherwise
    # ensemble[1]     list with the names of the inner models
    # ensemble[2]     list with the versions of the inner models
    ensemble = predict.get_ensemble()

    if not ensemble[0]:

        # regular model: predict directly from the input file
        success, results = predict.run(arguments['infile'])

    else:

        if arguments['infile'] is None:
            return False, 'ensemble models require allways an input file'

        emodels = ensemble[1]
        evers = ensemble[2]

        input_ok, model_res = get_ensemble_input(predict, emodels, evers, arguments['infile'])

        # a failure is recorded but the prediction is run anyway, so that
        # the error information can be generated downstream
        if not input_ok:
            predict.conveyor.setError(model_res)
            LOG.error(model_res)

        # warn when an inner model has an ID which is not part of the ID
        # stored for the ensemble model
        modelID = predict.conveyor.getMeta('modelID')
        for idx, emodel in enumerate(emodels):
            found, iID = utils.getModelID(emodel, evers[idx], 'model')
            if found and iID not in modelID:
                notice = f'Inner model {emodel}.{evers[idx]} has been updated. Rebuilding of ensemble model is recommended'
                predict.conveyor.setWarning(notice)
                LOG.warning(notice)

        # now run the model using the data from the external sources
        success, results = predict.run(model_res)

    LOG.info('Prediction completed...')

    return success, results