Example 1
def calc_descriptor_in_memory(descriptor,
                              configs,
                              desc_file,
                              ase_atoms_list,
                              tmp_folder=None,
                              desc_folder=None,
                              desc_info_file=None,
                              target_list=None,
                              operations_on_structure=None,
                              nb_jobs=-1,
                              **kwargs):
    """ Calculates the descriptor for a list of atomic structures.

    Starting from a list of ASE structures, calculates for each structure the descriptor
    specified by ``descriptor``, and stores the results in the compressed archive
    ``desc_file`` in the directory ``desc_folder``.

    Parameters:

    descriptor: :py:mod:`ai4materials.descriptors.base_descriptor.Descriptor` object
        Descriptor to calculate.

    configs: dict
        Contains configuration information such as folders for input and output (e.g. desc_folder, tmp_folder),
        logging level, and metadata location. See also :py:mod:`ai4materials.utils.utils_config.set_configs`.

    ase_atoms_list: list of ``ase.Atoms`` objects
        Atomic structures.

    desc_file: string
        Name of the compressed archive to which the files containing the descriptors are written.

    desc_folder: string, optional (default=`None`)
        Folder where the desc_file is written. If not specified, the desc_folder is read from
        ``configs['io']['desc_folder']``.

    tmp_folder: string, optional (default=`None`)
        Folder where temporary files are written. If not specified, the tmp_folder is read from
        ``configs['io']['tmp_folder']``.

    desc_info_file: string, optional (default=`None`)
        File where information about the descriptor is written to disk.

    target_list: list, optional (default=`None`)
        List of target values. These values are saved to disk when the descriptor is calculated,
        and they can be loaded for subsequent analysis.

    operations_on_structure: list of objects
        List of operations to be applied to the atomic structures before calculating the descriptor.

    nb_jobs: int, optional (default=-1)
        Number of processors to use in the calculation of the descriptor.
        If set to -1, all available processors will be used.


    .. codeauthor:: Angelo Ziletti <*****@*****.**>

    """

    # resolve desc_folder and tmp_folder before building any paths
    # (priority is given to the folders passed to the function; if they are
    # None, the values from configs are kept)
    configs = overwrite_configs(configs=configs,
                                desc_folder=desc_folder,
                                tmp_folder=tmp_folder)
    desc_folder = configs['io']['desc_folder']
    tmp_folder = configs['io']['tmp_folder']

    if desc_info_file is None:
        desc_info_file = os.path.abspath(
            os.path.normpath(os.path.join(desc_folder, 'desc_info.json.info')))

    desc_file = os.path.abspath(
        os.path.normpath(os.path.join(desc_folder, desc_file)))

    # make the log file empty without erasing it (erasing it causes
    # permission problems on the Docker image)
    outfile_path = os.path.join(tmp_folder, 'output.log')
    open(outfile_path, 'w').close()

    # remove control files from a previous run; they are listed in
    # tmp_folder, so they must also be removed from tmp_folder
    old_control_files = [
        f for f in os.listdir(tmp_folder) if f.endswith('control.json')
    ]
    for old_control_file in old_control_files:
        file_path = os.path.join(tmp_folder, old_control_file)
        try:
            if os.path.isfile(file_path):
                os.remove(file_path)
        except Exception as e:
            logger.error(e)

    tar = tarfile.open(desc_file, 'w:gz')

    if nb_jobs == -1:
        nb_jobs = min(len(ase_atoms_list), multiprocessing.cpu_count())

    with ProcessPoolExecutor(max_workers=nb_jobs) as executor:
        # materialize the results inside the with block, so that all workers
        # have finished before the executor is shut down
        ase_atoms_list_with_op_nested = list(
            executor.map(worker_apply_operations,
                         ((ase_atoms, operations_on_structure)
                          for ase_atoms in ase_atoms_list)))

    # each worker returns a list of structures; flatten into a single list
    ase_atoms_list_with_op = [
        item for sublist in ase_atoms_list_with_op_nested for item in sublist
    ]

    # check that all structures in the list have labels (needed for traceability later)
    label_present = [
        'label' in ase_atoms.info for ase_atoms in ase_atoms_list_with_op
    ]
    if not np.all(label_present):
        logger.info(
            "Some structures in the list do not have labels. Adding or substituting labels."
        )
        logger.info(
            "Default labels given by the order in the list (1st structure: label=struct-0)"
        )
        logger.info(
            "To avoid this add a label to each ASE structure using ase_atoms.info['label']='your_label'"
        )

        # substitute and add default labels
        for idx, ase_atoms in enumerate(ase_atoms_list_with_op):
            ase_atoms.info['label'] = 'struct-' + str(idx)

    logger.info('Using {} processors'.format(nb_jobs))

    # load descriptor metadata
    desc_metainfo = get_metadata_info()
    allowed_descriptors = desc_metainfo['descriptors']

    # add target to structures in the list
    if target_list is not None:
        for idx_atoms, ase_atoms in enumerate(ase_atoms_list_with_op):
            ase_atoms.info['target'] = target_list[idx_atoms]

    if descriptor.name in allowed_descriptors:
        logger.info("Calculating descriptor: {0}".format(descriptor.name))

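        # fix all structure-independent arguments with functools.partial, so
        # that each parallel worker only receives the ase.Atoms object it
        # has to process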
        worker_calc_descriptor = partial(
            calc_descriptor_one_structure,
            descriptor=descriptor,
            allowed_descriptors=allowed_descriptors,
            configs=configs,
            idx_slice=0,
            desc_file=desc_file,
            desc_folder=desc_folder,
            desc_info_file=desc_info_file,
            tmp_folder=tmp_folder,
            target_list=target_list,
            **kwargs)

        ase_atoms_results = parallel_process(ase_atoms_list_with_op,
                                             worker_calc_descriptor,
                                             nb_jobs=nb_jobs)

    else:
        raise ValueError(
            "Please provide a valid descriptor. Valid descriptors are {}".
            format(allowed_descriptors))

    logger.info("Calculation done.")

    logger.info('Writing descriptor information to file.')

    for idx_atoms, ase_atoms in enumerate(ase_atoms_results):
        descriptor.write(ase_atoms, tar=tar, op_id=0)
        write_ase_db_file(ase_atoms, configs, tar=tar, op_nb=0)

        # we assume that the target value does not change with the application of the operations
        write_target_values(ase_atoms, configs, op_nb=0, tar=tar)

    # write descriptor info to file for future reference
    write_desc_info_file(descriptor, desc_info_file, tar, ase_atoms_results)

    tar.close()

    desc_file_master = write_summary_file(descriptor,
                                          desc_file,
                                          tmp_folder,
                                          desc_file_master=desc_file + '.tar.gz',
                                          clean_tmp=False)

    clean_folder(tmp_folder)
    clean_folder(desc_folder,
                 endings_to_delete=(".png", ".npy", "_target.json", "_aims.in",
                                    "_info.pkl", "_coord.in",
                                    "_ase_atoms.json"))

    logger.info('Descriptor file: {}'.format(desc_file_master))

    return desc_file_master
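
A minimal usage sketch (not part of the original source): it assumes that the configs dict is built with ai4materials.utils.utils_config.set_configs and that a Diffraction2D descriptor is available under ai4materials.descriptors.diffraction2d; the descriptor choice, folder, and target values are illustrative assumptions.

from ase.build import bulk
from ai4materials.utils.utils_config import set_configs
from ai4materials.descriptors.diffraction2d import Diffraction2D  # assumed available

# set_configs creates the configs dict with the 'io' entries used above
configs = set_configs(main_folder='/tmp/ai4materials_demo')

# two toy structures; explicit labels keep the results traceable
atoms_fcc = bulk('Cu', 'fcc', a=3.6, cubic=True)
atoms_fcc.info['label'] = 'cu-fcc'
atoms_bcc = bulk('Fe', 'bcc', a=2.87, cubic=True)
atoms_bcc.info['label'] = 'fe-bcc'

descriptor = Diffraction2D(configs=configs)

desc_file_master = calc_descriptor_in_memory(descriptor=descriptor,
                                             configs=configs,
                                             desc_file='demo_descriptors.tar.gz',
                                             ase_atoms_list=[atoms_fcc, atoms_bcc],
                                             target_list=['fcc', 'bcc'],
                                             nb_jobs=-1)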
Example 2
def calc_descriptor(descriptor,
                    configs,
                    desc_file,
                    ase_atoms_list,
                    tmp_folder=None,
                    desc_folder=None,
                    desc_info_file=None,
                    target_list=None,
                    operations_on_structure=None,
                    nb_jobs=-1,
                    **kwargs):
    """ Calculates the descriptor for a list of atomic structures.

    Starting from a list of ASE structures, calculates for each structure the descriptor
    specified by ``descriptor``, and stores the results in the compressed archive
    ``desc_file`` in the directory ``desc_folder``.
    It uses multiprocessing.Pool to parallelize the calculation.

    Parameters:

    descriptor: :py:mod:`ai4materials.descriptors.base_descriptor.Descriptor` object
        Descriptor to calculate.

    configs: dict
        Contains configuration information such as folders for input and output (e.g. desc_folder, tmp_folder),
        logging level, and metadata location. See also :py:mod:`ai4materials.utils.utils_config.set_configs`.

    ase_atoms_list: list of ``ase.Atoms`` objects
        Atomic structures.

    desc_file: string
        Name of the compressed archive to which the files containing the descriptors are written.

    desc_folder: string, optional (default=`None`)
        Folder where the desc_file is written. If not specified, the desc_folder is read from
        ``configs['io']['desc_folder']``.

    tmp_folder: string, optional (default=`None`)
        Folder where temporary files are written. If not specified, the tmp_folder is read from
        ``configs['io']['tmp_folder']``.

    desc_info_file: string, optional (default=`None`)
        File where information about the descriptor is written to disk.

    target_list: list, optional (default=`None`)
        List of target values. These values are saved to disk when the descriptor is calculated,
        and they can be loaded for subsequent analysis.

    operations_on_structure: list of objects
        List of operations to be applied to the atomic structures before calculating the descriptor.

    nb_jobs: int, optional (default=-1)
        Number of processors to use in the calculation of the descriptor.
        If set to -1, all available processors will be used.


    .. codeauthor:: Angelo Ziletti <*****@*****.**>

    """

    if nb_jobs == -1:
        nb_jobs = min(len(ase_atoms_list), multiprocessing.cpu_count())

    # overwrite configs (priority is given to the folders defined in the function)
    # if desc_folder and tmp_folder are None, then configs are not overwritten
    configs = overwrite_configs(configs=configs,
                                desc_folder=desc_folder,
                                tmp_folder=tmp_folder)

    # define desc_folder and tmp_folder for convenience
    desc_folder = configs['io']['desc_folder']
    tmp_folder = configs['io']['tmp_folder']

    pool = multiprocessing.Pool(processes=nb_jobs)
    ase_atoms_list_with_op_nested = pool.map(
        worker_apply_operations,
        ((ase_atoms, operations_on_structure) for ase_atoms in ase_atoms_list))
    ase_atoms_list_with_op = [
        item for sublist in ase_atoms_list_with_op_nested for item in sublist
    ]
    pool.close()
    pool.join()

    # check that all structures in the list have labels (needed for traceability later)
    label_present = [
        'label' in ase_atoms.info for ase_atoms in ase_atoms_list_with_op
    ]
    if not np.all(label_present):
        logger.info(
            "Some structures in the list do not have labels. Adding or substituting labels."
        )
        logger.info(
            "Default labels given by the order in the list (1st structure: label=struct-0)"
        )
        logger.info(
            "To avoid this add a label to each ASE structure using ase_atoms.info['label']='your_label'"
        )

        # substitute and add default labels
        for idx, ase_atoms in enumerate(ase_atoms_list_with_op):
            ase_atoms.info['label'] = 'struct-' + str(idx)

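    # wrapper that fixes all slice-independent arguments: dispatch_jobs then
    # only passes the data slice, the per-slice descriptor file, and the
    # slice index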
    def _calc_descriptor_mp(data_slice, desc_file_i, idx_slice):
        _calc_descriptor(ase_atoms_list=data_slice,
                         desc_file=desc_file_i,
                         idx_slice=idx_slice,
                         descriptor=descriptor,
                         configs=configs,
                         logger=logger,
                         tmp_folder=tmp_folder,
                         desc_folder=desc_folder,
                         desc_info_file=desc_info_file,
                         target_list=target_list,
                         **kwargs)

    logger.info('Using {} processors'.format(nb_jobs))
    dispatch_jobs(_calc_descriptor_mp,
                  ase_atoms_list_with_op,
                  nb_jobs=nb_jobs,
                  desc_folder=desc_folder,
                  desc_file=desc_file)

    desc_file_master = collect_desc_folders(descriptor=descriptor,
                                            desc_folder=desc_folder,
                                            nb_jobs=nb_jobs,
                                            tmp_folder=tmp_folder,
                                            desc_file=desc_file,
                                            remove=True)

    # the cleaning of the tmp folder does not work if it is put here
    clean_folder(tmp_folder)
    clean_folder(desc_folder,
                 endings_to_delete=(".png", ".npy", "_target.json", "_aims.in",
                                    "_info.pkl", "_coord.in",
                                    "_ase_atoms.json"))

    logger.info('Descriptor file: {}'.format(desc_file_master))

    return desc_file_master
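
dispatch_jobs itself is not shown in this example. Below is a self-contained sketch of the underlying pattern, assuming each worker receives a (data_slice, desc_file_i, idx_slice) triple like _calc_descriptor_mp above; the function name and chunking policy are illustrative, not the library's actual implementation.

import multiprocessing

def dispatch_jobs_sketch(worker, items, nb_jobs, desc_file):
    # split `items` into nb_jobs contiguous slices (ceiling division, so no
    # item is dropped) and run `worker` on each slice in its own process
    chunk_size = -(-len(items) // nb_jobs)
    processes = []
    for idx_slice in range(nb_jobs):
        data_slice = items[idx_slice * chunk_size:(idx_slice + 1) * chunk_size]
        if not data_slice:
            continue
        # one descriptor archive per slice, merged later (cf. collect_desc_folders)
        desc_file_i = '{0}_slice-{1}'.format(desc_file, idx_slice)
        proc = multiprocessing.Process(target=worker,
                                       args=(data_slice, desc_file_i, idx_slice))
        processes.append(proc)
        proc.start()
    for proc in processes:
        proc.join()

Note that handing a closure such as _calc_descriptor_mp to multiprocessing only works with fork-based start methods (the Linux default); with the spawn start method the worker must be a picklable module-level function.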
Example 3
def prepare_dataset(structure_list,
                    target_list,
                    desc_metadata,
                    dataset_name,
                    target_name,
                    input_dims,
                    configs,
                    target_categorical=True,
                    dataset_folder=None,
                    desc_folder=None,
                    main_folder=None,
                    tmp_folder=None,
                    disc_type=None,
                    n_bins=100,
                    notes=None,
                    new_labels=None):
    """For a list of `ase.Atoms`, a `target_list`, and a `target_name` creates a dataset and writes it to file.

    Information regarding the dataset is saved in a summary file (ending with "_summary.json"). This includes, for
    example, the creation date, the paths to the pickles containing the feature matrix (ending with "_x.pkl") and the
    labels (ending with "_y.pkl"), `dataset_name`, `target_name`, `text_labels`, and user-defined notes on the
    dataset.

    The dataset written to file by `ai4materials.preprocessing.prepare_dataset` can be later loaded by
    `ai4materials.preprocessing.load_dataset_from_file`.

    Parameters:

    structure_list: list of `ase.Atoms`
        List of atomic structures.

    target_list: list of dict
        List of dictionaries as returned by `nomad-ml.wrappers.load_descriptor`. \n
        Each element of this list is a dictionary with only one key (data), \n
        which has as value a list of dicts. \n
        For example: \n
        {u'data': [{u'spacegroup_symbol_symprec_0.001': 194, u'chemical_formula': u'Ac258'}]}. \n
        More keywords are possible.

    desc_metadata: str
        Metadata of the descriptor to be extracted from the `ase.Atoms.info` dictionary.

    dataset_name: str
        Name to give to the dataset.

    target_name: str
        Name of the target to be extracted from `target_list` and saved in the label pickle.

    target_categorical: bool, optional (default = `True`)
        If `True`, the target to extract is assumed to be categorical, i.e. for classification.\n
        If `False`, the target to extract is assumed to be continuous, i.e. for regression.\n
        If `True`, the labels are discretized according to `disc_type`.

    disc_type: {'uniform', 'quantiles'}
        Type of discretization used if the target is categorical. In both cases, `n_bins` bins are used.
        See also :py:mod:`ai4materials.utils.utils_data_retrieval.extract_labels`.

    n_bins: int, optional (default=100)
        Number of bins used in the discretization.

    configs: dict
        Dictionary containing configuration information such as folders for input and output \n
        (e.g. `desc_folder`, `tmp_folder`), logging level, and metadata location.\n
        See also :py:mod:`ai4materials.utils.utils_config.set_configs`.

    dataset_folder: str, optional (default = `configs['io']['dataset_folder']`)
        Path to the folder where the dataset (two pickles with feature matrix and labels, \n
        plus a summary file in human-readable format) is saved.

    desc_folder: str, optional (default = `configs['io']['desc_folder']`)
        Path to the descriptor folder.

    tmp_folder: str, optional (default = `configs['io']['tmp_folder']`)
        Path to the tmp folder.

    main_folder: str, optional (default = `configs['io']['main_folder']`)
        Path to the main_folder.

    notes: str, optional (default = `None`)
        Notes/comments regarding the dataset that will be written in the dataset summary file.

    new_labels: dict, optional (default = `None`)
        Allows substituting the label names found in `target_list`. \n
        For example: \n
        new_labels = {"hcp": ["194"], "fcc": ["225"], "diam": ["227"], "bcc": ["229"]} \n
        will substitute each occurrence of "194" with "hcp" in the extracted label list. \n
        See also :py:mod:`ai4materials.utils.utils_data_retrieval.extract_labels`.

    Returns:

    str, str, str
        Returns the paths to the feature matrix pickle (numpy.ndarray), the label pickle (numpy.ndarray), \n
        and the human-readable summary file.\n
        This can be read by :py:mod:`ai4materials.preprocessing.load_dataset_from_file`.

    .. seealso:: modules :py:mod:`ai4materials.preprocessing.load_dataset_from_file`, \n
                         :py:mod:`ai4materials.wrappers.load_descriptor`

    .. codeauthor:: Angelo Ziletti <*****@*****.**>

    """

    configs = overwrite_configs(configs,
                                dataset_folder=dataset_folder,
                                desc_folder=desc_folder,
                                main_folder=main_folder,
                                tmp_folder=tmp_folder)

    dataset_folder = configs['io']['dataset_folder']

    data_set, nb_classes, label_encoder, numerical_labels, text_labels = merge_labels_data(
        structure_list=structure_list,
        target_list=target_list,
        desc_metadata=desc_metadata,
        target_categorical=target_categorical,
        one_hot=False,
        flatten_images=False,
        n_bins=n_bins,
        target_name=target_name,
        disc_type=disc_type,
        input_dims=input_dims,
        split_train_val=False,
        new_labels=new_labels)

    os.makedirs(dataset_folder, exist_ok=True)  # create the dataset folder if needed

    x_name = dataset_name + '_x'
    y_name = dataset_name + '_y'
    summary_name = dataset_name + '_summary'

    path_to_x = os.path.abspath(
        os.path.normpath(os.path.join(dataset_folder, x_name + '.pkl')))
    path_to_y = os.path.abspath(
        os.path.normpath(os.path.join(dataset_folder, y_name + '.pkl')))
    path_to_summary = os.path.abspath(
        os.path.normpath(os.path.join(dataset_folder, summary_name + '.json')))

    # write X and y to file
    with open(path_to_x, 'wb') as output:
        pickle.dump(data_set.images, output, pickle.HIGHEST_PROTOCOL)
        logger.info("Writing x to {0}".format(path_to_x))

    with open(path_to_y, 'wb') as output:
        pickle.dump(data_set.labels, output, pickle.HIGHEST_PROTOCOL)
        logger.info("Writing y to {0}".format(path_to_y))

    now = datetime.now()

    dataset_info = {
        "creation_date": str(now.isoformat()),
        "dataset_name": dataset_name,
        "target_name": target_name,
        "target_categorical": target_categorical,
        "disc_type": disc_type,
        "n_bins": n_bins,
        "path_to_x": path_to_x,
        "path_to_y": path_to_y,
        "path_to_summary": path_to_summary,
        "nb_classes": nb_classes,
        "classes": list(label_encoder.classes_),
        "numerical_labels": numerical_labels.tolist(),
        "text_labels": text_labels.tolist(),
        "notes": notes
    }

    # write the summary file with the main info about the dataset;
    # the result is a valid JSON document of the form {"data": [ ... ]}
    with open(path_to_summary, "w") as f:
        f.write("""
    {
          "data":[""")

        json.dump(dataset_info, f, indent=2)

        f.write("""
    ] }""")

        f.flush()

    logger.info('Summary file written in {0}.'.format(path_to_summary))

    return path_to_x, path_to_y, path_to_summary
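
A hedged end-to-end sketch (not part of the original source): write a small dataset and read it back. It assumes structure_list and configs exist as documented above, that the descriptor data sits in ase_atoms.info under the hypothetical key 'diffraction_2d_intensity', and that load_dataset_from_file has the signature suggested by the docstring; all of these are assumptions about the surrounding library.

# structure_list: e.g. the structures returned by a previous descriptor
# calculation; target_list: dicts in the format shown in the docstring
target_list = [{'data': [{'spacegroup_symbol_symprec_0.001': 225}]},
               {'data': [{'spacegroup_symbol_symprec_0.001': 229}]}]

path_to_x, path_to_y, path_to_summary = prepare_dataset(
    structure_list=structure_list,
    target_list=target_list,
    desc_metadata='diffraction_2d_intensity',  # hypothetical metadata key
    dataset_name='crystals_demo',
    target_name='spacegroup_symbol_symprec_0.001',
    input_dims=(64, 64),
    configs=configs,
    target_categorical=True,
    disc_type='uniform',
    n_bins=4,
    notes='demo dataset')

# load the dataset back for training or analysis (assumed signature)
from ai4materials.preprocessing import load_dataset_from_file
x, y, dataset_info = load_dataset_from_file(path_to_x, path_to_y,
                                            path_to_summary)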