def get_optimal_box_size(xyz_filename,
                         padding_ratio,
                         target_nb_atoms=128,
                         up_tolerance=80,
                         down_tolerance=30,
                         max_iter=100,
                         init_sliding_volume=None,
                         step=0.1,
                         cutoff_percentile=20):
    """Search for a sliding volume whose boxes hold about ``target_nb_atoms`` atoms.

    Starting from ``init_sliding_volume``, the structure is cut into boxes, the
    median number of atoms of the well-populated boxes is computed, and every
    component of the sliding volume is grown or shrunk by ``step`` until the
    median lies within the tolerance window around the target.

    Parameters
    ----------
    xyz_filename:
        Structure file, forwarded to ``get_boxes_from_xyz``.
    padding_ratio:
        Forwarded unchanged to ``get_boxes_from_xyz``.
    target_nb_atoms: int, optional (default=128)
        Desired number of atoms per box.
    up_tolerance: int, optional (default=80)
        The median may exceed the target by at most this amount.
    down_tolerance: int, optional (default=30)
        The median may fall below the target by at most this amount.
    max_iter: int, optional (default=100)
        Maximum number of adjustment iterations.
    init_sliding_volume: sequence of float, optional (default=None)
        Initial sliding volume.
        NOTE(review): ``None`` is passed straight to ``get_boxes_from_xyz`` and
        cannot be stepped afterwards - confirm callers always pass a sequence.
    step: float, optional (default=0.1)
        Amount added to / subtracted from every component per iteration.
    cutoff_percentile: float, optional (default=20)
        Boxes whose atom count does not exceed this percentile of all counts
        are ignored when the median is computed.

    Returns
    -------
    The last sliding volume tried. If no iteration met the tolerance window
    this is the volume after the final adjustment and a warning is logged.
    """
    # initial guess
    sliding_volume = init_sliding_volume

    logger.info("Target nb_atoms: {}".format(target_nb_atoms))

    for idx in range(max_iter):
        logger.info("Iteration: {}".format(idx))

        # The stride used for the search is fixed; only the volume is varied.
        xyz_boxes = get_boxes_from_xyz(xyz_filename,
                                       sliding_volume,
                                       stride_size=(6., 6., 20.),
                                       padding_ratio=padding_ratio)
        lengths = get_lengths_from_xyz_boxes(xyz_boxes).flatten()

        cutoff_nb_atoms = np.percentile(lengths, cutoff_percentile)

        # Keep only the boxes holding more atoms than the cutoff, so that
        # sparsely filled boxes do not drag the median down.
        threshold_indices = np.array(lengths) > cutoff_nb_atoms
        selected_lengths = np.extract(threshold_indices, lengths)
        if selected_lengths.size > 0:
            lengths = selected_lengths
        # else: every count is <= the cutoff (e.g. all boxes hold the same
        # number of atoms). Filtering would leave nothing to take a percentile
        # of, so the unfiltered counts are used instead.

        nb_atoms = np.percentile(lengths, 50)

        logger.info("Mode of the nb_atoms distribution {}".format(nb_atoms))
        logger.info("Sliding box {}".format(sliding_volume))
        logger.info("Diff {}".format(nb_atoms - target_nb_atoms))

        if nb_atoms - up_tolerance <= target_nb_atoms <= nb_atoms + down_tolerance:
            logger.info(
                "Mode of the nb_atoms distribution {}".format(nb_atoms))
            logger.info("nb_atoms - up_tolerance {}".format(nb_atoms -
                                                            up_tolerance))
            logger.info("nb_atoms + down_tolerance {}".format(nb_atoms +
                                                              down_tolerance))
            logger.info("Chosen sliding volume: {}".format(sliding_volume))
            break
        elif nb_atoms - target_nb_atoms <= 0:
            logger.debug(
                "Increasing sliding volume: {}".format(sliding_volume))
            sliding_volume = [item + step for item in sliding_volume]
        else:
            logger.debug(
                "Decreasing sliding volume: {}".format(sliding_volume))
            sliding_volume = [item - step for item in sliding_volume]

        # Release the boxes before the next iteration builds a new set.
        del xyz_boxes
    else:
        # The loop ended without ``break``: the tolerance window was never met.
        logger.warning(
            "No sliding volume within tolerance found after {} iterations; "
            "returning {}".format(max_iter, sliding_volume))

    return sliding_volume
def calc_local(geometry_files,
               box_size,
               stride,
               configs,
               padding_ratio=None,
               min_atoms=3,
               adjust_box_size_by_number_of_atoms=False,
               min_n_atoms=100,
               criterion='median',
               min_atoms_spm=50,
               model_file=None,
               path_to_summary_train=None,
               descriptor=None,
               mc_samples=1000,
               plot_results=False,
               desc_filename=None,
               nb_jobs=-1):
    """Run the strided-pattern-matching analysis for a list of geometry files.

    For every geometry file a dedicated output folder is created, the
    structure is cut into boxes, a descriptor is calculated for each box, the
    classification model is applied, and the resulting probabilities and
    uncertainty estimates are loaded from the results folder.

    Parameters
    ----------
    geometry_files: list
        List of geometry files. Must not be empty.
    box_size: list
        List of box size values (float), one for each geometry file.
    stride: list
        List of lists of strides, one for each geometry file.
    configs: dict
        Only ``configs['io']['main_folder']`` is read; it is the folder in
        which the per-file output folders are created.
    padding_ratio: list, optional (default=None)
        List of 1D lists, where each element specifies the amount of empty
        space (relative to the box size, i.e., taking values in [0, 1]) that
        is appended at the boundaries. Choosing a value of 0.5-1.0 typically
        suffices. By default a padding of 1.0 * box_size is used for each
        spatial dimension. The inner lists are modified in place: for every
        dimension in which the structure is not larger than the box size the
        entry is set to 0.0.
    min_atoms: int, optional (default=3)
        Minimum number of atoms contained in each box for which a descriptor
        will be calculated (passed to the default descriptor).
    adjust_box_size_by_number_of_atoms: boolean, optional (default=False)
        If True, the box size passed in is ignored and a box size is grown
        from 0 in steps of 1 until the median number of atoms per box is at
        least ``min_n_atoms``.
    min_n_atoms: int, optional (default=100)
        Target used if adjust_box_size_by_number_of_atoms=True.
    criterion: string, optional (default='median')
        Accepted for backward compatibility. NOTE: the value is currently not
        evaluated; the median is always used.
    min_atoms_spm: int, optional (default=50)
        Passed as ``min_nb_atoms`` to make_strided_pattern_matching_dataset.
    model_file: path to h5 file, optional (default=None)
        If None, the model used in Leitherer et. al. 2021 will be used.
    path_to_summary_train: optional (default=None)
        Forwarded to get_classification_map.
    descriptor: object, optional (default=None)
        If None, the quippy SOAP descriptor will be employed automatically
        with the standard settings used in Leitherer et. al. 2021. The
        descriptor created for the first geometry file is reused for all
        following files (only its ``configs`` attribute is updated).
    mc_samples: int, optional (default=1000)
        Number of Monte Carlo samples to calculate the uncertainty estimate.
    plot_results: boolean, optional (default=False)
        Decide whether to automatically generate svg files for visual
        analysis.
    desc_filename: list, optional (default=None)
        Descriptor files, entry ``i`` being used for ``geometry_files[i]``.
        Must contain at least ``len(geometry_files)`` entries.
    nb_jobs: int (default=-1)
        Number of CPUs used for parallel calculation.

    Returns
    -------
    predictions: list
        One array per geometry file, loaded from '<file>_probabilities.npy'.
    uncertainty: list
        One dict per geometry file with the keys 'mutual_information',
        'variation_ratio' and 'predictive_entropy'.

    Raises
    ------
    ValueError
        If desc_filename is a string or too short, if geometry_files is
        empty, if stride or box_size is a float, or if stride, box_size or
        padding_ratio does not have the same length as geometry_files.
    """
    if desc_filename is not None:
        # A plain string would be indexed character by character further
        # down and a too short sequence would fail with an IndexError in the
        # middle of the run, so both are rejected up front.
        if (isinstance(desc_filename, str)
                or len(desc_filename) < len(geometry_files)):
            raise ValueError(
                "If specify desc files, specifiy them as list containing at least len(geometry_files) entries."
            )

    if model_file is None:
        model_file = get_data_filename(
            'data/nn_models/AI_SYM_Leitherer_et_al_2021.h5')

    if len(geometry_files) == 0:
        raise ValueError(
            "No geometry files specified - or only passed as string and not as list."
        )

    if isinstance(stride, float) or isinstance(box_size, float):
        raise ValueError(
            "Please specify stride and box size as list of floats.")

    if padding_ratio is None:
        padding_ratio = [[1.0, 1.0, 1.0] for _ in geometry_files]

    for key, parameter in (('stride', stride), ('box_size', box_size),
                           ('padding_ratio', padding_ratio)):
        print('Test parameter {}'.format(key))
        if not len(parameter) == len(geometry_files):
            raise ValueError(
                "Parameter {} needs to be list of same length as geometry_files."
                .format(key))

    strides = stride
    box_sizes = box_size
    padding_ratios = padding_ratio

    base_folder = configs['io']['main_folder']
    structure_files = geometry_files

    predictions = []
    uncertainty = []

    for geom_file_id, (structure_file, stride_size, box_size,
                       padding_ratio) in enumerate(
                           zip(structure_files, strides, box_sizes,
                               padding_ratios)):
        print('Structure file {}'.format(structure_file))

        appendix_to_folder = '_box_' + str(box_size) + '_stride_' + str(
            stride_size)
        # atoms scaling chosen automatically here to include the maximal
        # information -> may provide that as an option in the future.
        atoms_scaling_cutoffs = [box_size, box_size * 2, box_size * 3]

        new_directory = os.path.join(
            base_folder,
            os.path.basename(structure_file)[:-4] + appendix_to_folder)
        if not os.path.exists(new_directory):
            os.makedirs(new_directory)
        else:
            # An existing folder is never removed; the first free
            # '<folder>_run_<n>' (n >= 2) is used instead.
            run = 2
            while os.path.exists(new_directory + '_run_' + str(run)):
                run += 1
            new_directory = new_directory + '_run_' + str(run)
            os.makedirs(new_directory)

        main_folder = new_directory

        # read config file
        configs_new = set_configs(main_folder=main_folder)

        # setup folder and files - need to check for future release
        # if all of this is necessary.
        checkpoint_dir = os.path.dirname(model_file)
        checkpoint_filename = os.path.basename(model_file)

        dataset_folder = os.path.abspath(
            os.path.normpath(os.path.join(main_folder, 'datasets')))
        conf_matrix_file = os.path.abspath(
            os.path.normpath(os.path.join(main_folder,
                                          'confusion_matrix.png')))

        configs_new['io']['dataset_folder'] = dataset_folder

        if adjust_box_size_by_number_of_atoms:
            # In the future: refine this part: start from large box and large
            # stride, then make it finer to get more reasonable number of
            # atoms, i.e., start with large box, also make it smaller if
            # exceed the number of atoms!
            initial_box_size = 0
            box_size_step_size = 1
            max_spread = 10
            current_mean_natoms = 0
            current_spread = max_spread * 2
            counter = 0
            start_time = time.time()
            box_size = initial_box_size
            while current_mean_natoms < min_n_atoms:
                counter += 1
                print("Iteration {}".format(counter))
                box_size += box_size_step_size
                _, number_of_atoms_xyz = get_boxes_from_xyz(
                    structure_file,
                    sliding_volume=[box_size, box_size, box_size],
                    stride_size=[4.0, 4.0, 4.0],
                    give_atom_density=True,
                    plot_atom_density=False,
                    padding_ratio=[0.0, 0.0, 0.0])
                natoms_per_box = np.array(number_of_atoms_xyz).flatten()
                # Despite the variable name, the median is used here.
                current_mean_natoms = np.median(natoms_per_box)
                current_spread = np.std(natoms_per_box)
                print("Mean Natoms = {}, spread = {} ".format(
                    current_mean_natoms, current_spread))
            print("Final box size = {} with natoms mean = {} and spread = {}".
                  format(box_size, current_mean_natoms, current_spread))
            end_time = time.time()
            print("--- %s seconds ---" % (end_time - start_time))

        # adjust padding ratio for slab structures
        polycrystal_structure = read(structure_file, ':', 'xyz')[0]
        positions = polycrystal_structure.positions
        for dim in range(3):
            positions_current_dim = positions[:, dim]
            extension_current_dim = abs(
                max(positions_current_dim) - min(positions_current_dim))
            if extension_current_dim <= box_size:
                # The structure is not thicker than one box in this
                # direction: use no padding there.
                # TODO: only the padding is adjusted; the box size and the
                # stride in this direction are left untouched.
                padding_ratio[dim] = 0.0
        print("Final stride = {}, final padding ratio = {}".format(
            stride_size, padding_ratio))

        # Descriptor
        if descriptor is None:
            l_max = 6
            n_max = 9
            atom_sigma = 0.1
            cutoff = 4.0
            central_weight = 0.0
            constrain_nn_distances = False

            descriptor = quippy_SOAP_descriptor(
                configs=configs_new,
                p_b_c=False,
                cutoff=cutoff,
                l_max=l_max,
                n_max=n_max,
                atom_sigma=atom_sigma,
                central_weight=central_weight,
                average=True,
                average_over_permuations=False,
                number_averages=200,
                atoms_scaling='quantile_nn',
                atoms_scaling_cutoffs=atoms_scaling_cutoffs,
                extrinsic_scale_factor=1.0,
                n_Z=1,
                Z=1,
                n_species=1,
                species_Z=1,
                scale_element_sensitive=True,
                return_binary_descriptor=True,
                average_binary_descriptor=True,
                min_atoms=min_atoms,
                shape_soap=316,
                constrain_nn_distances=constrain_nn_distances)
        # important! otherwise descriptors will be calculated in desc file of
        # first geometry file
        descriptor.configs = configs_new

        desc_filename_to_load = None
        if desc_filename is not None:
            desc_filename_to_load = desc_filename[geom_file_id]

        log_file_path = os.path.join(
            main_folder,
            os.path.basename(structure_file)[:-4] + '_log_file.txt')
        # The context manager closes the log file even if the descriptor
        # calculation or the prediction raises.
        with open(log_file_path, 'w') as save_file:
            start = time.time()
            (path_to_x_test, path_to_y_test, path_to_summary_test,
             path_to_strided_pattern_pos
             ) = make_strided_pattern_matching_dataset(
                 polycrystal_file=structure_file,
                 descriptor=descriptor,
                 desc_metadata='SOAP_descriptor',
                 configs=configs_new,
                 operations_on_structure=None,
                 stride_size=stride_size,
                 box_size=box_size,
                 init_sliding_volume=None,
                 desc_file=desc_filename_to_load,
                 desc_only=False,
                 show_plot_lengths=False,
                 desc_file_suffix_name='',
                 nb_jobs=nb_jobs,
                 padding_ratio=padding_ratio,
                 min_nb_atoms=min_atoms_spm)
            end = time.time()
            ex_time = str(end - start)
            print('Execution time descriptor calculation: ' + ex_time)
            save_file.write('Runtime crystal' + structure_file + ' ' +
                            ex_time)

            configs_new['io']['polycrystal_file'] = os.path.basename(
                structure_file)

            start = time.time()
            get_classification_map(
                configs_new,
                path_to_x_test,
                path_to_y_test,
                path_to_summary_test,
                path_to_strided_pattern_pos,
                checkpoint_dir,
                checkpoint_filename=checkpoint_filename,
                mc_samples=mc_samples,
                interpolation='none',
                results_file=None,
                calc_uncertainty=True,
                conf_matrix_file=conf_matrix_file,
                train_set_name='soap_pristine_data',
                cmap_uncertainty='hot',
                interpolation_uncertainty='none',
                plot_results=plot_results,
                path_to_summary_train=path_to_summary_train)
            end = time.time()

            prediction_str = 'Time for predicting ' + str(end -
                                                          start) + ' s \n'
            save_file.write(prediction_str)
            save_file.write('Box size ' + str(box_size) + ', stride_size ' +
                            str(stride_size) + ' padding_ratio ' +
                            str(padding_ratio) + ' min_atoms for quippy: ' +
                            str(min_atoms) + ' minatoms SPM ' +
                            str(min_atoms_spm) + ' cutoff_for_scaling ' +
                            str(atoms_scaling_cutoffs))

        # load and append predictions and uncertainty
        results_folder = configs_new['io']['results_folder']
        polycrystal_name = configs_new['io']['polycrystal_file']

        prediction = np.load(
            os.path.join(results_folder,
                         polycrystal_name + '_probabilities.npy'))
        predictions.append(prediction)

        uncertainty_dict = {
            key: np.load(
                os.path.join(results_folder,
                             polycrystal_name + '_' + key + '.npy'))
            for key in ('mutual_information', 'variation_ratio',
                        'predictive_entropy')
        }
        uncertainty.append(uncertainty_dict)

        print('Clean tmp folder')
        clean_folder(configs_new['io']['tmp_folder'],
                     endings_to_delete=(".png", ".npy", "_target.json",
                                        "_aims.in", "_ase_atoms_info.pkl",
                                        "_ase_atoms.json", "_coord.in"))

    return predictions, uncertainty
# Example script: cut the structure in 'polycrystal.xyz' into boxes and read
# the size of the resulting box grid.
from ai4materials.utils.utils_crystals import get_boxes_from_xyz
import os, time

filename = 'polycrystal.xyz'

frame_size=[112.0, 112.0, 80.0]
# The stride equals the sliding volume in every dimension
# (presumably this yields non-overlapping boxes - confirm against
# get_boxes_from_xyz).
sliding_volume=[14.0, 14.0, 80.0]
stride_size=[14.0, 14.0, 80.0]

# NOTE(review): no matching end time is taken in the visible part of the script.
start_time = time.time()
# give_atom_density=True makes the call return a second value, which is
# unpacked into number_of_atoms_xyz.
boxes, number_of_atoms_xyz = get_boxes_from_xyz(filename,
                                                frame_size=frame_size,
                                                sliding_volume=sliding_volume,
                                                stride_size= stride_size,
                                                adapt=False,
                                                give_atom_density=True,
                                                plot_atom_density=False,
                                                padding_ratio=0.0)

# Grid extent taken from the second and third nesting level of ``boxes``
# (presumably boxes is indexed as [z][y][x] - TODO confirm).
y_grid_max = len(boxes[0])
x_grid_max = len(boxes[0][0])

# NOTE(review): the pixel sizes and counts below are not used in the visible
# part of the script; their consumer is outside this view.
pixelSizeX = 0.1
pixelSizeY = 0.1
nX = 32
nY = 32
# Example script: generate a four-grain polycrystal and cut it into boxes.
output_filename = 'example'
# NOTE(review): main_folder is not defined in the visible part of the script.
# output_filename_path is computed here, but the calls below use the bare
# output_filename instead.
output_filename_path = os.path.abspath(
    os.path.normpath(os.path.join(main_folder, output_filename)))

box_size = [80.0, 80.0, 15.0]

# four grains
# Each entry is [[three floats], [element symbol, lattice name, number(s)]]
# (presumably a position inside the box and the lattice constant(s) -
# confirm against generate_polycrystal).
grain_specifications = [[[40.0, 40.0, 10.56], ['Al', 'fcc', 4.046]],
                        [[0.45, 2.15, 10.56], ['Co', 'hcp', 2.507, 4.067]],
                        [[10.3, 18.1, 10.56], ['C', 'diamond', 3.571]],
                        [[18.2, 5.21, 10.56], ['Fe', 'bcc', 2.856]]]

# this requires atomsk installed (http://atomsk.univ-lille1.fr/)
generate_polycrystal(output_filename, box_size, grain_specifications)

########################
# Get boxes
########################
sliding_volume = [10.0, 10.0, 15.0]
stride_size = [8.0, 8.0, 15.0]

# The boxes are read from '<output_filename>.xyz'.
xyz_boxes = get_boxes_from_xyz(output_filename + '.xyz', sliding_volume,
                               stride_size)
boxes = np.asarray(xyz_boxes)
print("Boxes.shape: {}".format(boxes.shape))

#Visualization
#os.system('ovito '+output_filename+'.xyz')
def get_structures_by_boxes(xyz_filename,
                            stride_size,
                            box_size,
                            show_plot_lengths=False,
                            padding_ratio=(1.0, 1.0, 1.0),
                            init_sliding_volume=None):
    """Cut a structure file into cubic boxes and return them as a flat list.

    Parameters
    ----------
    xyz_filename:
        Structure file, forwarded to ``get_boxes_from_xyz``. Its base name
        without extension is used as prefix of the box labels.
    stride_size:
        Forwarded to ``get_boxes_from_xyz``.
    box_size: float or None
        Edge length of the cubic box. If None, it is determined with
        ``get_optimal_box_size`` (target of 128 atoms, cutoff percentile 20).
    show_plot_lengths: bool, optional (default=False)
        If True, show one image per slice (first index of the box grid) with
        the number of atoms in each box.
    padding_ratio: sequence, optional (default=(1.0, 1.0, 1.0))
        Forwarded to ``get_boxes_from_xyz`` (and ``get_optimal_box_size``).
    init_sliding_volume: optional (default=None)
        Initial guess, only used if ``box_size`` is None.

    Returns
    -------
    list
        The boxes in the iteration order of the nested box grid. Each box
        gets a cubic cell of edge ``box_size``, ``info['label']`` set to
        '<basename>_<k>_<i>_<j>' and ``info['strided_pattern_positions']``
        set to the grid indices ``(k, i, j)``.
    """
    xyz_filename_without_ext = os.path.splitext(
        os.path.basename(xyz_filename))[0]

    if box_size is not None:
        sliding_volume = [box_size, box_size, box_size]
    else:
        logger.info("Determining box_size automatically.")
        sliding_volume = get_optimal_box_size(
            xyz_filename,
            padding_ratio,
            target_nb_atoms=128,
            cutoff_percentile=20,
            init_sliding_volume=init_sliding_volume)
        # Only cubic boxes are supported below.
        assert sliding_volume[0] == sliding_volume[1]
        assert sliding_volume[1] == sliding_volume[2]
        box_size = sliding_volume[0]

    # give_atom_density=True makes the call return a second value; it is not
    # needed here.
    xyz_boxes, _ = get_boxes_from_xyz(xyz_filename,
                                      sliding_volume,
                                      stride_size,
                                      padding_ratio=padding_ratio,
                                      give_atom_density=True,
                                      plot_atom_density=False)

    # Grid extent per nesting level; the innermost level is reported as x.
    nb_boxes_z = len(xyz_boxes)
    nb_boxes_y = len(xyz_boxes[0])
    nb_boxes_x = len(xyz_boxes[0][0])
    tot_nb_boxes = nb_boxes_z * nb_boxes_y * nb_boxes_x

    logger.info("Box size: {}".format(box_size))
    logger.info("Stride size: {}".format(stride_size))
    logger.info("Numbers of boxes in x, y, z: {0} {1} {2}".format(
        nb_boxes_x, nb_boxes_y, nb_boxes_z))
    logger.info("Total numbers of boxes: {}".format(tot_nb_boxes))

    # Allocate the counts explicitly instead of np.empty_like(xyz_boxes):
    # converting a nested list of structure objects to an array depends on
    # how NumPy coerces the contained objects.
    array_lengths = np.zeros((nb_boxes_z, nb_boxes_y, nb_boxes_x), dtype=int)

    ase_atoms_list = []
    for k, boxes_plane in enumerate(xyz_boxes):
        for i, boxes_row in enumerate(boxes_plane):
            for j, ase_atoms in enumerate(boxes_row):
                array_lengths[k, i, j] = len(ase_atoms)

                # add cell and label
                ase_atoms.set_cell(
                    np.array((box_size, box_size, box_size)) *
                    np.identity(3))
                ase_atoms.info['label'] = xyz_filename_without_ext + '_' + str(
                    k) + '_' + str(i) + '_' + str(j)
                ase_atoms.info['strided_pattern_positions'] = np.asarray(
                    (k, i, j))
                ase_atoms_list.append(ase_atoms)

    if show_plot_lengths:
        for idx_slice in range(array_lengths.shape[0]):
            array_lengths_slice = array_lengths[idx_slice].astype(float)
            fig, ax = plt.subplots()
            cax = ax.imshow(array_lengths_slice,
                            interpolation='nearest',
                            cmap=plt.cm.afmhot,
                            origin='lower')
            ax.set_title(
                'Number of atoms in each box for slice {}'.format(idx_slice))
            fig.colorbar(cax)
            plt.show()

    return ase_atoms_list