Example #1
# Assumed imports for this standalone excerpt (the original module header is not shown);
# get_boxes_from_xyz is the utility imported in Example #3, and get_lengths_from_xyz_boxes
# is assumed to come from the same utilities module.
import logging

import numpy as np

from ai4materials.utils.utils_crystals import get_boxes_from_xyz

logger = logging.getLogger(__name__)


def get_optimal_box_size(xyz_filename,
                         padding_ratio,
                         target_nb_atoms=128,
                         up_tolerance=80,
                         down_tolerance=30,
                         max_iter=100,
                         init_sliding_volume=None,
                         step=0.1,
                         cutoff_percentile=20):
    # initial guess
    sliding_volume = init_sliding_volume

    logger.info("Target nb_atoms: {}".format(target_nb_atoms))

    for idx in range(max_iter):
        logger.info("Iteration: {}".format(idx))

        xyz_boxes = get_boxes_from_xyz(xyz_filename,
                                       sliding_volume,
                                       stride_size=(6., 6., 20.),
                                       padding_ratio=padding_ratio)

        lengths = get_lengths_from_xyz_boxes(xyz_boxes).flatten()
        cutoff_nb_atoms = np.percentile(lengths, cutoff_percentile)

        # keep only boxes whose atom count exceeds the cutoff percentile
        threshold_indices = np.array(lengths) > cutoff_nb_atoms
        lengths = np.extract(threshold_indices, lengths)

        nb_atoms = np.percentile(lengths, 50)

        logger.info("Mode of the nb_atoms distribution {}".format(nb_atoms))
        logger.info("Sliding box {}".format(sliding_volume))
        logger.info("Diff {}".format(nb_atoms - target_nb_atoms))

        if nb_atoms - up_tolerance <= target_nb_atoms <= nb_atoms + down_tolerance:
            logger.info(
                "Median of the nb_atoms distribution {}".format(nb_atoms))
            logger.info("nb_atoms - up_tolerance {}".format(nb_atoms -
                                                            up_tolerance))
            logger.info("nb_atoms + down_tolerance {}".format(nb_atoms +
                                                              down_tolerance))
            logger.info("Chosen sliding volume: {}".format(sliding_volume))
            break
        elif nb_atoms - target_nb_atoms <= 0:
            logger.debug(
                "Increasing sliding volume: {}".format(sliding_volume))
            sliding_volume = [item + step for item in sliding_volume]
        else:
            logger.debug(
                "Decreasing sliding volume: {}".format(sliding_volume))
            sliding_volume = [item - step for item in sliding_volume]

        del xyz_boxes

    return sliding_volume
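
A minimal usage sketch for get_optimal_box_size; the file name, padding ratio and initial
sliding volume below are illustrative assumptions, not taken from the source.

optimal_volume = get_optimal_box_size('polycrystal.xyz',
                                      padding_ratio=(1.0, 1.0, 1.0),
                                      target_nb_atoms=128,
                                      init_sliding_volume=[10.0, 10.0, 10.0],
                                      step=0.1)
print('Chosen sliding volume: {}'.format(optimal_volume))
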

# The helper functions used below (set_configs, get_data_filename, quippy_SOAP_descriptor,
# make_strided_pattern_matching_dataset, get_classification_map, clean_folder) as well as
# os, time, numpy (np) and ase.io.read are assumed to be imported from ai4materials and the
# standard library; the original module header is not shown in this excerpt.
def calc_local(geometry_files,
               box_size,
               stride,
               configs,
               padding_ratio=None,
               min_atoms=3,
               adjust_box_size_by_number_of_atoms=False,
               min_n_atoms=100,
               criterion='median',
               min_atoms_spm=50,
               model_file=None,
               path_to_summary_train=None,
               descriptor=None,
               mc_samples=1000,
               plot_results=False,
               desc_filename=None,
               nb_jobs=-1):
    """
    geometry_files: list
        list of geometry files

    box_size: list
        list of box size values (float) to be used for each geometry file.

    stride: list
        list of list of strides to be used for each geometry file.

    configs: dict
        ai4materials configuration dictionary; at least
        configs['io']['main_folder'] must be set, since it is used as
        the base folder for all output.

    padding_ratio: list, optional (default=None)
        list of 1D lists, where each element specifies the
        amount of empty space (relative to the box size, i.e.,
        taking values in [0,1]) that is appended
        at the boundaries. Choosing a value
        of 0.5-1.0 typically suffices.
        For the default setting, a padding of 1.0 * box_size
        is used for each spatial dimension.

    min_atoms: int, optional (default=3)
        Minimum number of atoms contained in each box
        for which a descriptor will be calculated.

    adjust_box_size_by_number_of_atoms: boolean, optional (default=False)
        Determine if the box size is automatically tuned
        such that at least 'min_n_atoms' are contained in each box.
        The keyword 'criterion' fixes if the mean or the median of
        the number of atoms is at least 'min_n_atoms'.

    min_n_atoms: int, optional (default=100)
        If adjust_box_size_by_number_of_atoms=True, this number is
        used to increase the box size until at least min_n_atoms
        atoms are contained in each box based on the criterion fixed
        via the keyword 'criterion'.

    criterion: string, optional (default='median')
        If adjust_box_size_by_number_of_atoms = True, the box size will
        be increased until at least min_n_atoms atoms are contained either
        according to the average (criterion='average') or the
        median (criterion='median').

    min_atoms_spm: int, optional (default=50)
        Minimum number of atoms per box that is passed on to the strided
        pattern matching step (as min_nb_atoms).

    model_file: path to h5 file, optional (default=None)
        If None, the model used in Leitherer et al. 2021 will be used.

    path_to_summary_train: path, optional (default=None)
        Path to the summary file of the training set; it is forwarded to
        get_classification_map.

    descriptor: object, optional (default=None)
        If None, the quippy SOAP descriptor will be employed automatically
        with the standard settings used in Leitherer et al. 2021.

    mc_samples: int, optional (default=1000)
        Number of Monte Carlo samples used to calculate the uncertainty estimate.

    plot_results: boolean, optional (default=False)
        Decide whether to automatically generate svg files for visual analysis.

    desc_filename: list, optional (default=None)
        list of precomputed descriptor files, one per geometry file. If
        provided, the descriptors are loaded from these files instead of
        being recalculated.

    nb_jobs: int (default=-1)
        Number of CPUs used for parallel calculation.

    """
    if desc_filename is not None:
        if not (isinstance(desc_filename, list)
                and len(desc_filename) == len(geometry_files)):
            raise ValueError(
                "If descriptor files are specified, provide them as a list "
                "with one entry per geometry file.")

    if model_file is None:
        model_file = get_data_filename(
            'data/nn_models/AI_SYM_Leitherer_et_al_2021.h5')

    if len(geometry_files) == 0:
        raise ValueError(
            "No geometry files specified - or only passed as string and not as list."
        )

    parameters_to_check = {
        'stride': stride,
        'box_size': box_size,
        'padding_ratio': padding_ratio
    }
    if isinstance(stride, float) or isinstance(box_size, float):
        raise ValueError(
            "Please specify stride and box size as lists of floats.")

    for key in parameters_to_check:
        parameter = parameters_to_check[key]
        print('Checking parameter {}'.format(key))
        if key == 'padding_ratio':
            if parameter is None:
                parameter = [[1.0, 1.0, 1.0]
                             for _ in range(len(geometry_files))]
                padding_ratio = parameter
        if not len(parameter) == len(geometry_files):
            raise ValueError(
                "Parameter {} needs to be list of same length as geometry_files."
                .format(key))
    strides = stride
    box_sizes = box_size
    padding_ratios = padding_ratio
    """
    if not type(box_size) == list:
        box_sizes = [float(box_size)]
    else:
        box_sizes = box_size
    if not type(stride) == list:
        strides = [[float(stride), float(stride), float(stride)]]
    elif type(stride) == list:
        strides = [[_, _, _] for _ in stride]
    else:
        strides = stride
    if not type(padding_ratio) == list:
        padding_ratios = [padding_ratio]
    else:
        padding_ratios = padding_ratio
    
    if padding_ratio==None:
        padding_ratios = [[1.0, 1.0, 1.0] for _ in range(len(geometry_files))]
    """

    base_folder = configs['io']['main_folder']
    structure_files = geometry_files

    predictions = []
    uncertainty = []
    #print(structure_files, strides, box_sizes, padding_ratios)
    geom_file_id = 0
    for structure_file, stride_size, box_size, padding_ratio in zip(
            structure_files, strides, box_sizes, padding_ratios):
        print('Structure file {}'.format(structure_file))
        appendix_to_folder = '_box_' + str(box_size) + '_stride_' + str(
            stride_size)

        # atoms scaling cutoffs are chosen automatically here to include the maximal
        # information; this may be provided as an option in the future.
        atoms_scaling_cutoffs = [box_size, box_size * 2, box_size * 3]
        #atoms_scaling_cutoffs=[20.,30.,40.,50.]

        new_directory = os.path.join(
            base_folder,
            os.path.basename(structure_file)[:-4] + appendix_to_folder)
        if not os.path.exists(new_directory):
            os.makedirs(new_directory)
        else:
            """
            shutil.rmtree(new_directory)           #removes all the subdirectories! -> disabled for now.
            os.makedirs(new_directory)
            """
            run = 2
            while os.path.exists(new_directory + '_run_' + str(run)):
                run += 1
            new_directory = new_directory + '_run_' + str(run)
            os.makedirs(new_directory)
        main_folder = new_directory

        # read config file
        configs_new = set_configs(main_folder=main_folder)
        #logger_new = setup_logger(configs_new, level='INFO', display_configs=False)
        # setup folder and files   - need to check for future release
        # if all of this is necessary.
        checkpoint_dir = os.path.dirname(model_file)
        checkpoint_filename = os.path.basename(model_file)

        dataset_folder = os.path.abspath(
            os.path.normpath(os.path.join(main_folder, 'datasets')))
        conf_matrix_file = os.path.abspath(
            os.path.normpath(os.path.join(main_folder,
                                          'confusion_matrix.png')))
        results_file = os.path.abspath(
            os.path.normpath(os.path.join(main_folder, 'results.csv')))

        configs_new['io']['dataset_folder'] = dataset_folder

        if adjust_box_size_by_number_of_atoms:
            # Future refinement: start from a large box and a large stride, then refine both
            # to reach a more reasonable number of atoms; also shrink the box if the atom
            # count is exceeded.
            initial_box_size = 0
            box_size_step_size = 1
            max_spread = 10
            current_mean_natoms = 0
            current_spread = max_spread * 2
            counter = 0

            start_time = time.time()
            box_size = initial_box_size
            while current_mean_natoms < min_n_atoms:  # or current_spread>max_spread:
                counter += 1
                print("Iteration {}".format(counter))
                box_size += box_size_step_size
                boxes, number_of_atoms_xyz = get_boxes_from_xyz(
                    structure_file,
                    sliding_volume=[box_size, box_size, box_size],
                    stride_size=[4.0, 4.0, 4.0],  # alternatively [box_size/4., box_size/4., box_size/4.]
                    give_atom_density=True,
                    plot_atom_density=False,
                    padding_ratio=[0.0, 0.0, 0.0])  # optionally: atom_density_filename=os.getcwd()

                current_mean_natoms = np.median(
                    np.array(number_of_atoms_xyz).flatten())
                current_spread = np.std(
                    np.array(number_of_atoms_xyz).flatten())
                print("Mean Natoms = {}, spread = {} ".format(
                    current_mean_natoms, current_spread))

            print("Final box size = {} with natoms mean = {} and spread = {}".
                  format(box_size, current_mean_natoms, current_spread))
            end_time = time.time()

            print("--- %s seconds ---" % (end_time - start_time))

        # adjust padding ratio for slab structures
        polycrystal_structure = read(structure_file, ':', 'xyz')[0]
        positions = polycrystal_structure.positions
        for dim in range(3):
            positions_current_dim = positions[:, dim]
            extension_current_dim = abs(
                max(positions_current_dim) - min(positions_current_dim))
            if extension_current_dim <= box_size:
                # If the slab is thinner than the box size in this direction, only one step
                # is taken along it and no padding is used there.
                # TODO: currently only the padding is adjusted; the box size itself should
                # probably be adjusted as well.
                # stride_size[dim] = round(extension_current_dim * 2)  # fails if extension = 0.0
                padding_ratio[dim] = 0.0
        print("Final stride = {}, final padding ratio = {}".format(
            stride_size, padding_ratio))

        # Descriptor
        if descriptor is None:
            #p_b_c=False
            l_max = 6
            n_max = 9
            atom_sigma = 0.1
            cutoff = 4.0
            central_weight = 0.0
            constrain_nn_distances = False
            descriptor = quippy_SOAP_descriptor(
                configs=configs_new,
                p_b_c=False,
                cutoff=cutoff,
                l_max=l_max,
                n_max=n_max,
                atom_sigma=atom_sigma,
                central_weight=central_weight,
                average=True,
                average_over_permuations=False,
                number_averages=200,
                atoms_scaling='quantile_nn',
                atoms_scaling_cutoffs=atoms_scaling_cutoffs,
                extrinsic_scale_factor=1.0,
                n_Z=1,
                Z=1,
                n_species=1,
                species_Z=1,
                scale_element_sensitive=True,
                return_binary_descriptor=True,
                average_binary_descriptor=True,
                min_atoms=min_atoms,
                shape_soap=316,
                constrain_nn_distances=constrain_nn_distances)

        descriptor.configs = configs_new  # important! otherwise descriptors will be calculated in desc file of first geometry file

        save_file = open(
            os.path.join(
                main_folder,
                os.path.basename(structure_file)[:-4] + '_log_file.txt'), 'w')
        # if descriptors for this .xyz file were already calculated, they are loaded from desc_filename
        desc_filename_to_load = None
        if desc_filename is not None:
            desc_filename_to_load = desc_filename[geom_file_id]
            geom_file_id += 1

        start = time.time()
        path_to_x_test, path_to_y_test, path_to_summary_test, path_to_strided_pattern_pos = make_strided_pattern_matching_dataset(
            polycrystal_file=structure_file,
            descriptor=descriptor,
            desc_metadata='SOAP_descriptor',
            configs=configs_new,
            operations_on_structure=None,
            stride_size=stride_size,
            box_size=box_size,
            init_sliding_volume=None,
            desc_file=desc_filename_to_load,
            desc_only=False,
            show_plot_lengths=False,
            desc_file_suffix_name='',
            nb_jobs=nb_jobs,
            padding_ratio=padding_ratio,
            min_nb_atoms=min_atoms_spm)  #min_atoms)
        end = time.time()
        ex_time = str(end - start)
        print('Execution time descriptor calculation: ' + ex_time)
        #print(path_to_x_test)
        #print(path_to_y_test)
        #print(path_to_summary_test)
        #print(path_to_strided_pattern_pos)
        save_file.write('Runtime crystal ' + structure_file + ' ' + ex_time + '\n')

        # copy soap information into dataset folder (need to find more elegant way in the future)
        #shift_training_data_to_different_path(configs_new['io']['dataset_folder'])
        configs_new['io']['polycrystal_file'] = os.path.basename(
            structure_file)

        start = time.time()
        get_classification_map(configs_new,
                               path_to_x_test,
                               path_to_y_test,
                               path_to_summary_test,
                               path_to_strided_pattern_pos,
                               checkpoint_dir,
                               checkpoint_filename=checkpoint_filename,
                               mc_samples=mc_samples,
                               interpolation='none',
                               results_file=None,
                               calc_uncertainty=True,
                               conf_matrix_file=conf_matrix_file,
                               train_set_name='soap_pristine_data',
                               cmap_uncertainty='hot',
                               interpolation_uncertainty='none',
                               plot_results=plot_results,
                               path_to_summary_train=path_to_summary_train)
        end = time.time()
        prediction_str = 'Time for predicting ' + str(end - start) + ' s \n'
        save_file.write(prediction_str)
        save_file.write('Box size ' + str(box_size) + ', stride_size ' +
                        str(stride_size) + ' padding_ratio ' +
                        str(padding_ratio) + ' min_atoms for quippy: ' +
                        str(min_atoms) + ' minatoms SPM ' +
                        str(min_atoms_spm) + ' cutoff_for_scaling ' +
                        str(atoms_scaling_cutoffs))
        save_file.close()

        # load and append predictions and uncertainty
        prediction = np.load(
            os.path.join(
                configs_new['io']['results_folder'],
                configs_new['io']['polycrystal_file'] + '_probabilities.npy'))
        predictions.append(prediction)

        uncertainty_dict = {
            'mutual_information': [],
            'variation_ratio': [],
            'predictive_entropy': []
        }
        for key in uncertainty_dict:
            uncertainty_ = np.load(
                os.path.join(
                    configs_new['io']['results_folder'],
                    configs_new['io']['polycrystal_file'] + '_' + key +
                    '.npy'))
            uncertainty_dict[key] = uncertainty_
        uncertainty.append(uncertainty_dict)

        print('Clean tmp folder')
        clean_folder(configs_new['io']['tmp_folder'],
                     endings_to_delete=(".png", ".npy", "_target.json",
                                        "_aims.in", "_ase_atoms_info.pkl",
                                        "_ase_atoms.json", "_coord.in"))

    return predictions, uncertainty
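
A minimal sketch of how calc_local might be invoked, based on the docstring above. The file
name, box size, stride and the minimal configs dictionary are illustrative assumptions; in
practice the configs object would come from ai4materials' set_configs.

configs = {'io': {'main_folder': '/tmp/arise_example'}}  # assumed minimal configuration
predictions, uncertainty = calc_local(geometry_files=['polycrystal.xyz'],
                                      box_size=[14.0],
                                      stride=[[4.0, 4.0, 4.0]],
                                      configs=configs,
                                      padding_ratio=[[1.0, 1.0, 1.0]],
                                      mc_samples=1000,
                                      plot_results=False,
                                      nb_jobs=-1)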
Example #3
from ai4materials.utils.utils_crystals import get_boxes_from_xyz

import os
import time

import numpy as np

filename = 'polycrystal.xyz'

frame_size = [112.0, 112.0, 80.0]
sliding_volume = [14.0, 14.0, 80.0]
stride_size = [14.0, 14.0, 80.0]
start_time = time.time()
boxes, number_of_atoms_xyz = get_boxes_from_xyz(filename,
                                                frame_size=frame_size,
                                                sliding_volume=sliding_volume,
                                                stride_size=stride_size,
                                                adapt=False,
                                                give_atom_density=True,
                                                plot_atom_density=False,
                                                padding_ratio=0.0)

y_grid_max = len(boxes[0])
x_grid_max = len(boxes[0][0])


pixelSizeX = 0.1
pixelSizeY = 0.1

nX = 32
nY = 32
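
The atom counts returned via give_atom_density=True can be summarized directly; a small
follow-up sketch using only the variables defined above:

atom_counts = np.asarray(number_of_atoms_xyz).flatten()
print('Grid of boxes (x, y): {} x {}'.format(x_grid_max, y_grid_max))
print('Median atoms per box: {}'.format(np.median(atom_counts)))
print('Spread (std) of atoms per box: {}'.format(np.std(atom_counts)))
print('--- %s seconds ---' % (time.time() - start_time))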

# A second snippet from the same example: generate a synthetic polycrystal and box it.
# generate_polycrystal is assumed to be importable from the ai4materials utilities, and
# main_folder is assumed to point to an existing output directory.
main_folder = os.getcwd()

output_filename = 'example'

output_filename_path = os.path.abspath(
    os.path.normpath(os.path.join(main_folder, output_filename)))

box_size = [80.0, 80.0, 15.0]
# four grains
grain_specifications = [[[40.0, 40.0, 10.56], ['Al', 'fcc', 4.046]],
                        [[0.45, 2.15, 10.56], ['Co', 'hcp', 2.507, 4.067]],
                        [[10.3, 18.1, 10.56], ['C', 'diamond', 3.571]],
                        [[18.2, 5.21, 10.56], ['Fe', 'bcc', 2.856]]]

# this requires atomsk installed (http://atomsk.univ-lille1.fr/)
generate_polycrystal(output_filename, box_size, grain_specifications)

########################
# Get boxes
########################

sliding_volume = [10.0, 10.0, 15.0]
stride_size = [8.0, 8.0, 15.0]
xyz_boxes = get_boxes_from_xyz(output_filename + '.xyz', sliding_volume,
                               stride_size)

boxes = np.asarray(xyz_boxes)

print("Boxes.shape: {}".format(boxes.shape))

# Visualization (requires OVITO):
# os.system('ovito ' + output_filename + '.xyz')
Example #5
# Assumed imports for this standalone excerpt (the original module header is not shown);
# get_optimal_box_size is the function from Example #1 and get_boxes_from_xyz is the
# utility imported in Example #3.
import logging
import os

import matplotlib.pyplot as plt
import numpy as np

from ai4materials.utils.utils_crystals import get_boxes_from_xyz

logger = logging.getLogger(__name__)


def get_structures_by_boxes(xyz_filename,
                            stride_size,
                            box_size,
                            show_plot_lengths=False,
                            padding_ratio=(1.0, 1.0, 1.0),
                            init_sliding_volume=None):
    xyz_filename_without_ext, xyz_filename_extension = os.path.splitext(
        os.path.basename(xyz_filename))

    if box_size is not None:
        sliding_volume = [box_size, box_size, box_size]
    else:
        logger.info("Determining box_size automatically.")
        sliding_volume = get_optimal_box_size(
            xyz_filename,
            padding_ratio,
            target_nb_atoms=128,
            cutoff_percentile=20,
            init_sliding_volume=init_sliding_volume)

        assert sliding_volume[0] == sliding_volume[1]
        assert sliding_volume[1] == sliding_volume[2]
        box_size = sliding_volume[0]

    xyz_boxes, number_of_atoms_xyz = get_boxes_from_xyz(
        xyz_filename,
        sliding_volume,
        stride_size,
        padding_ratio=padding_ratio,
        give_atom_density=True,
        plot_atom_density=False)
    tot_nb_boxes = len(xyz_boxes) * len(xyz_boxes[0]) * len(xyz_boxes[0][0])

    logger.info("Box size: {}".format(box_size))
    logger.info("Stride size: {}".format(stride_size))
    logger.info("Numbers of boxes in x, y, z: {0} {1} {2}".format(
        len(xyz_boxes[0][0]), len(xyz_boxes[0]), len(xyz_boxes)))
    logger.info("Total numbers of boxes: {}".format(tot_nb_boxes))

    array_lengths = np.empty_like(xyz_boxes)

    ase_atoms_list = []
    for k in range(len(xyz_boxes)):
        for i in range(len(xyz_boxes[0])):
            for j in range(len(xyz_boxes[0][0])):
                ase_atoms = xyz_boxes[k][i][j]
                array_lengths[k, i, j] = len(ase_atoms)
                # add cell and label
                ase_atoms.set_cell(
                    np.array((box_size, box_size, box_size)) * np.identity(3))
                ase_atoms.info['label'] = xyz_filename_without_ext + '_' + str(
                    k) + '_' + str(i) + '_' + str(j)
                ase_atoms.info['strided_pattern_positions'] = np.asarray(
                    (k, i, j))

                ase_atoms_list.append(ase_atoms)

    if show_plot_lengths:
        for idx_slice in range(array_lengths.shape[0]):
            array_lengths_slice = array_lengths[idx_slice].astype(float)
            fig, ax = plt.subplots()
            cax = ax.imshow(array_lengths_slice,
                            interpolation='nearest',
                            cmap=plt.cm.afmhot,
                            origin='lower')
            ax.set_title(
                'Number of atoms in each box for slice {}'.format(idx_slice))
            fig.colorbar(cax)
            plt.show()

    return ase_atoms_list
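
A minimal usage sketch for get_structures_by_boxes; the file name, stride and box size are
illustrative assumptions, not taken from the source.

ase_atoms_list = get_structures_by_boxes('polycrystal.xyz',
                                         stride_size=[4.0, 4.0, 4.0],
                                         box_size=14.0,
                                         show_plot_lengths=False,
                                         padding_ratio=(1.0, 1.0, 1.0))
print('Number of boxes converted to ASE Atoms objects: {}'.format(len(ase_atoms_list)))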