def load_datasets(dataset_folder):
    """Download the pristine dataset and the dataset with 25% vacancies from Dataverse."""

    train_set_name = 'pristine_dataset'
    # the IDs to retrieve the data can be found at this page:
    # https://dataverse.harvard.edu/api/datasets/export?exporter=dataverse_json&persistentId=doi%3A10.7910/DVN/ZDKBRF
    url_dataset_info_pristine = "https://dataverse.harvard.edu/api/access/datafile/3238706?format=original"
    url_x_pristine = "https://dataverse.harvard.edu/api/access/datafile/3238702?format=original"
    url_y_pristine = "https://dataverse.harvard.edu/api/access/datafile/3238704?format=original"
    path_to_x_pristine = os.path.join(dataset_folder,
                                      train_set_name + '_x.pkl')
    path_to_y_pristine = os.path.join(dataset_folder,
                                      train_set_name + '_y.pkl')
    path_to_summary_pristine = os.path.join(dataset_folder,
                                            train_set_name + '_summary.json')

    logger.info(
        "Downloading dataset of pristine structures from the Harvard Dataverse into {}."
        .format(dataset_folder))
    logger.info("Size: ~500MB. This may take a few minutes.")
    # urllib.urlretrieve exists only in Python 2; in Python 3 it is urllib.request.urlretrieve
    urllib.request.urlretrieve(url_x_pristine, path_to_x_pristine)
    urllib.request.urlretrieve(url_y_pristine, path_to_y_pristine)
    urllib.request.urlretrieve(url_dataset_info_pristine, path_to_summary_pristine)

    test_set_name = 'vac25_dataset'
    # the IDs to retrieve the data can be found at this page:
    # https://dataverse.harvard.edu/api/datasets/export?exporter=dataverse_json&persistentId=doi%3A10.7910/DVN/ZDKBRF
    url_dataset_info_vac25 = "https://dataverse.harvard.edu/api/access/datafile/3238706?format=original"
    url_x_vac25 = "https://dataverse.harvard.edu/api/access/datafile/3238702?format=original"
    # same labels as the pristine dataset: we assume the vacancies we introduce do not change the class
    url_y_vac25 = "https://dataverse.harvard.edu/api/access/datafile/3238704?format=original"
    path_to_x_vac25 = os.path.join(dataset_folder, test_set_name + '_x.pkl')
    path_to_y_vac25 = os.path.join(dataset_folder, test_set_name + '_y.pkl')
    path_to_summary_vac25 = os.path.join(dataset_folder,
                                         test_set_name + '_summary.json')

    logger.info(
        "Downloading dataset of structures with 25% vacancies from the Harvard Dataverse into {}."
        .format(dataset_folder))
    logger.info("Size: ~500MB. This may take a few minutes.")
    urllib.request.urlretrieve(url_dataset_info_vac25, path_to_summary_vac25)
    urllib.request.urlretrieve(url_x_vac25, path_to_x_vac25)
    urllib.request.urlretrieve(url_y_vac25, path_to_y_vac25)
    logger.info("Download completed.")

    # load datasets
    x_pristine, y_pristine, dataset_info_pristine = load_dataset_from_file(
        path_to_x_pristine, path_to_y_pristine, path_to_summary_pristine)

    x_vac25, y_vac25, dataset_info_vac25 = load_dataset_from_file(
        path_to_x_vac25, path_to_y_vac25, path_to_summary_vac25)

    return x_pristine, y_pristine, dataset_info_pristine, x_vac25, y_vac25, dataset_info_vac25
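
A minimal usage sketch for load_datasets, assuming dataset_folder already exists; to avoid re-downloading ~1GB on every run, one could check for the pickled files first (the file names follow the pattern used inside the function):

# hypothetical usage sketch; the guard against re-downloading is an assumption,
# not part of the original example
if os.path.exists(os.path.join(dataset_folder, 'pristine_dataset_x.pkl')):
    # already downloaded: load the pristine set directly from disk
    x_pristine, y_pristine, dataset_info_pristine = load_dataset_from_file(
        os.path.join(dataset_folder, 'pristine_dataset_x.pkl'),
        os.path.join(dataset_folder, 'pristine_dataset_y.pkl'),
        os.path.join(dataset_folder, 'pristine_dataset_summary.json'))
else:
    (x_pristine, y_pristine, dataset_info_pristine,
     x_vac25, y_vac25, dataset_info_vac25) = load_datasets(dataset_folder)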
Example 2
# add the spacegroup as target (using the spacegroup of the parent structure for the defective structures)
#targets = ['fcc111', 'fcc111', 'fcc110', 'fcc110', 'fcc100', 'fcc100']

label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(targets_list)
labels_list = label_encoder.transform(targets_list)

print(images_list.shape)
print(labels_list.shape)
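
For reference, a self-contained round trip of the LabelEncoder step above, with illustrative class names (the actual targets_list is built earlier in the script):

# standalone sketch of the label encoding; the class names are illustrative
from sklearn import preprocessing

example_targets = ['bcc', 'fcc', 'fcc', 'hcp', 'bcc', 'hcp']
enc = preprocessing.LabelEncoder()
encoded = enc.fit_transform(example_targets)
print(encoded)                         # [0 1 1 2 0 2]; classes are sorted alphabetically
print(enc.classes_)                    # ['bcc' 'fcc' 'hcp']
print(enc.inverse_transform(encoded))  # back to the original text labels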

path_to_x, path_to_y, path_to_summary = prepare_dataset_STEM(
    images_list=images_list,
    labels_list=labels_list,
    desc_metadata='diffraction_2d_intensity',
    dataset_name='STEM_monocrystalline',
    target_name='target',
    target_categorical=True,
    input_dims=(64, 64),
    configs=configs,
    dataset_folder=dataset_folder,
    main_folder=configs['io']['main_folder'],
    desc_folder=configs['io']['desc_folder'],
    tmp_folder=configs['io']['tmp_folder'],
    notes="STEM Dataset with bcc, fcc, and hcp structures, pristine.")

x, y, dataset_info = load_dataset_from_file(path_to_x=path_to_x,
                                            path_to_y=path_to_y,
                                            path_to_summary=path_to_summary)

#print(x)
Example 3
        os.path.normpath(
            os.path.join(dataset_folder, train_set_name + '_summary.json')))

    test_set_name = 'disp0.1_dataset'
    path_to_x_test = os.path.abspath(
        os.path.normpath(os.path.join(dataset_folder,
                                      test_set_name + '_x.pkl')))
    path_to_y_test = os.path.abspath(
        os.path.normpath(os.path.join(dataset_folder,
                                      test_set_name + '_y.pkl')))
    path_to_summary_test = os.path.abspath(
        os.path.normpath(
            os.path.join(dataset_folder, test_set_name + '_summary.json')))

    x_train, y_train, dataset_info_train = load_dataset_from_file(
        path_to_x=path_to_x_train,
        path_to_y=path_to_y_train,
        path_to_summary=path_to_summary_train)

    x_test, y_test, dataset_info_test = load_dataset_from_file(
        path_to_x=path_to_x_test,
        path_to_y=path_to_y_test,
        path_to_summary=path_to_summary_test)

    params_cnn = {
        "nb_classes": dataset_info_train["data"][0]["nb_classes"],
        "classes": dataset_info_train["data"][0]["classes"],
        # "checkpoint_filename": 'try_'+str(now.isoformat()),
        "checkpoint_filename": 'ziletti_et_2018_rgb',
        "batch_size": 32,
        "img_channels": 3
    }
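
The params_cnn entries are read from the dataset summary; judging from the dataset_info_train["data"][0][...] lookups used throughout these examples, the *_summary.json file presumably contains a structure like the following (a guessed minimal sketch, not an authoritative schema):

# guessed minimal structure of a *_summary.json file, inferred from the lookups
# in these examples; the field values are illustrative
dataset_info_example = {
    "data": [{
        "nb_classes": 3,
        "classes": ["bcc", "fcc", "hcp"],        # text name per class index
        "text_labels": ["bcc", "fcc", "hcp"],    # one text label per sample
        "numerical_labels": [0, 1, 2],           # one integer label per sample
    }]
}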
Example 4
# Download the dataset from the online repository and load it
# =============================================================================

#x_pristine, y_pristine, dataset_info_pristine, x_vac25, y_vac25, dataset_info_vac25 = load_datasets(dataset_folder)

train_set_name = 'STEM_monocrystalline_train'
path_to_x_pristine = os.path.join(dataset_folder, train_set_name + '_x.pkl')
path_to_y_pristine = os.path.join(dataset_folder, train_set_name + '_y.pkl')
path_to_summary_pristine = os.path.join(dataset_folder, train_set_name + '_summary.json')

test_set_name = 'STEM_monocrystalline_test'
path_to_x_vac25 = os.path.join(dataset_folder, test_set_name + '_x.pkl')
path_to_y_vac25 = os.path.join(dataset_folder, test_set_name + '_y.pkl')
path_to_summary_vac25 = os.path.join(dataset_folder, test_set_name + '_summary.json')

x_pristine, y_pristine, dataset_info_pristine = load_dataset_from_file(path_to_x_pristine, path_to_y_pristine,
                                                                       path_to_summary_pristine)

x_vac25, y_vac25, dataset_info_vac25 = load_dataset_from_file(path_to_x_vac25, path_to_y_vac25,
                                                              path_to_summary_vac25)

#print(x_pristine)

# =============================================================================
# Train the convolutional neural network
# =============================================================================

# load the convolutional neural network architecture from Ziletti et al., Nature Communications 9, 2775 (2018)
# previously tried filter configurations: [32, 16, 8, 8, 16, 32] and [32, 16, 16, 8, 8]
# an alternative builder, cnn_architecture_ai4STEM, was also tried with the same arguments
partial_model_architecture = partial(cnn_nature_comm_ziletti2018,
                                     conv2d_filters=[32, 32, 16, 16, 8, 8],
                                     kernel_sizes=[3, 3, 3, 3, 3, 3],
                                     max_pool_strides=[2, 2],
                                     hidden_layer_size=128)
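
functools.partial freezes the architecture hyperparameters so that the training routine can build the model later, once it knows the remaining arguments (e.g. the number of classes). A minimal sketch of the pattern with a stand-in builder; the real signature of cnn_nature_comm_ziletti2018 is only assumed here:

# sketch of the deferred-construction pattern; build_cnn is a stand-in builder
from functools import partial

def build_cnn(conv2d_filters, kernel_sizes, hidden_layer_size, nb_classes):
    print("building CNN:", conv2d_filters, kernel_sizes, hidden_layer_size, nb_classes)

frozen_architecture = partial(build_cnn,
                              conv2d_filters=[32, 32, 16, 16, 8, 8],
                              kernel_sizes=[3, 3, 3, 3, 3, 3],
                              hidden_layer_size=128)

frozen_architecture(nb_classes=5)  # the trainer supplies what is still missing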
Example 5
def get_classification_map(configs,
                           path_to_x_test,
                           path_to_y_test,
                           path_to_summary_test,
                           path_to_strided_pattern_pos,
                           checkpoint_dir,
                           checkpoint_filename,
                           mc_samples=100,
                           interpolation='none',
                           results_file=None,
                           calc_uncertainty=True,
                           conf_matrix_file=None,
                           train_set_name='hcp-bcc-sc-diam-fcc-pristine',
                           cmap_uncertainty='hot',
                           interpolation_uncertainty='none',
                           plot_results=False,
                           path_to_summary_train=None):

    if path_to_summary_train is None:
        path_to_x, path_to_y, path_to_summary = get_training_data_paths()
        with open(path_to_summary, 'rb') as f:
            dataset_info_train = json.load(f)
    else:
        # load the summary from the user-specified path, so that
        # dataset_info_train is defined in both branches
        with open(path_to_summary_train, 'rb') as f:
            dataset_info_train = json.load(f)
    x_test, y_test, dataset_info_test = load_dataset_from_file(
        path_to_x=path_to_x_test,
        path_to_y=path_to_y_test,
        path_to_summary=path_to_summary_test)

    with open(path_to_strided_pattern_pos, 'rb') as input_spm_pos:
        strided_pattern_pos = pickle.load(input_spm_pos)

    logger.debug('Strided pattern positions shape: {0}'.format(
        strided_pattern_pos.shape))

    params_cnn = {
        "nb_classes": dataset_info_train["data"][0]["nb_classes"],
        "classes": dataset_info_train["data"][0]["classes"],
        "batch_size": 32,
        "img_channels": 1
    }

    text_labels = np.asarray(dataset_info_test["data"][0]["text_labels"])
    numerical_labels = np.asarray(
        dataset_info_test["data"][0]["numerical_labels"])

    # get classes and the numerical-to-text label conversion for plotting below
    classes_text_labels = dataset_info_train["data"][0]['classes']
    numerical_to_text_label = dict(
        zip(range(len(classes_text_labels)), classes_text_labels))

    filename_no_ext = os.path.abspath(
        os.path.normpath(os.path.join(checkpoint_dir, checkpoint_filename)))

    model = load_model(filename_no_ext)

    results = predict(x_test,
                      y_test,
                      model=model,
                      configs=configs,
                      nb_classes=dataset_info_train["data"][0]["nb_classes"],
                      batch_size=params_cnn["batch_size"],
                      mc_samples=mc_samples,
                      conf_matrix_file=conf_matrix_file,
                      numerical_labels=numerical_labels,
                      text_labels=text_labels,
                      results_file=results_file)

    predictive_mean = results['prob_predictions']
    uncertainty = results['uncertainty']

    class_plot_pos = np.asarray(strided_pattern_pos)
    (z_max, y_max, x_max) = np.amax(class_plot_pos, axis=0) + 1

    # make a dataframe to order the prob_predictions
    # this is needed when reading from file: the structures are ordered differently after saving
    # this comes into play only if more than 10 values are used in each direction
    df_positions = pd.DataFrame(data=class_plot_pos,
                                columns=[
                                    'strided_pattern_positions_z',
                                    'strided_pattern_positions_y',
                                    'strided_pattern_positions_x'
                                ])

    # sort the predictive mean by strided pattern position
    # (join_axes was deprecated in pandas 0.25 and later removed; concat + reindex is equivalent)
    df_predictive_mean = pd.DataFrame(data=predictive_mean)
    df = pd.concat([df_positions, df_predictive_mean], axis=1)
    df = df.reindex(df_positions.index)
    df_predictive_mean_sorted = df.sort_values(
        ['strided_pattern_positions_z', 'strided_pattern_positions_y',
         'strided_pattern_positions_x'],
        ascending=True)

    predictive_mean_sorted = df_predictive_mean_sorted.drop(columns=[
        'strided_pattern_positions_z', 'strided_pattern_positions_y',
        'strided_pattern_positions_x'
    ]).values

    predictive_mean_all_classes = []
    for idx_class in range(predictive_mean_sorted.shape[1]):

        if z_max == 1:
            prob_prediction_class = predictive_mean_sorted[:, idx_class].reshape(
                y_max, x_max)
        else:
            prob_prediction_class = predictive_mean_sorted[:, idx_class].reshape(
                z_max, y_max, x_max)
        predictive_mean_all_classes.append(prob_prediction_class)
        title = 'Proto ' + numerical_to_text_label[idx_class] + ' Probability'
        if not plot_results:
            continue
        plot_prediction_heatmaps(prob_prediction_class,
                                 title=title,
                                 class_name=str(idx_class),
                                 prefix='prob',
                                 main_folder=configs['io']['main_folder'],
                                 cmap='viridis',
                                 color_nan='lightgrey',
                                 interpolation=interpolation,
                                 vmin=0.0,
                                 vmax=1.0)  # added vmin, vmax here

    np.save(
        os.path.join(configs['io']['results_folder'],
                     configs['io']['polycrystal_file'] + '_probabilities.npy'),
        np.array(predictive_mean_all_classes))

    if calc_uncertainty:
        df_uncertainty = pd.DataFrame()
        for key in uncertainty.keys():
            df_uncertainty[key] = uncertainty[key]

        # join_axes was deprecated in pandas 0.25.0 and later removed; concat + reindex
        # is equivalent (note: the pandas version pinned in the package setup may not
        # match the installed one)
        df = pd.concat([df_positions, df_uncertainty], axis=1)
        df = df.reindex(df_positions.index)
        df_uncertainty_sorted = df.sort_values(
            ['strided_pattern_positions_z', 'strided_pattern_positions_y',
             'strided_pattern_positions_x'],
            ascending=True)

        uncertainty_sorted = df_uncertainty_sorted.drop(columns=[
            'strided_pattern_positions_z', 'strided_pattern_positions_y',
            'strided_pattern_positions_x'
        ])

        for key in uncertainty.keys():
            if z_max == 1:
                # make a two-dimensional plot
                uncertainty_prediction = uncertainty_sorted[key].values.reshape(
                    y_max, x_max)
            else:
                uncertainty_prediction = uncertainty_sorted[key].values.reshape(
                    z_max, y_max, x_max)

            np.save(
                os.path.join(
                    configs['io']['results_folder'],
                    configs['io']['polycrystal_file'] + '_' + key + '.npy'),
                uncertainty_prediction)
            if not plot_results:
                continue
            # for idx_uncertainty in range(predictive_mean_sorted.shape[1]):
            plot_prediction_heatmaps(uncertainty_prediction,
                                     title='Uncertainty ({})'.format(str(key)),
                                     main_folder=configs['io']['main_folder'],
                                     cmap=cmap_uncertainty,
                                     color_nan='lightgrey',
                                     prefix='uncertainty',
                                     suffix=str(key),
                                     interpolation=interpolation_uncertainty)
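
The mc_samples argument suggests that predict() estimates uncertainty from repeated stochastic forward passes (e.g. Monte Carlo dropout). A standalone numpy sketch of that idea, under that assumption:

# numpy sketch of Monte Carlo uncertainty estimation; assumes each call to
# stochastic_forward_pass(x) returns per-class probabilities with dropout active
import numpy as np

def mc_uncertainty(stochastic_forward_pass, x, mc_samples=100):
    # stack the sampled predictions: shape (mc_samples, n_points, nb_classes)
    probs = np.stack([stochastic_forward_pass(x) for _ in range(mc_samples)])
    predictive_mean = probs.mean(axis=0)
    # predictive entropy as a simple per-point uncertainty measure
    predictive_entropy = -np.sum(predictive_mean * np.log(predictive_mean + 1e-12),
                                 axis=-1)
    return predictive_mean, predictive_entropy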
Example 6
#dataset_folder = configs['io']['main_folder']
dataset_folder = os.path.join(configs['io']['main_folder'], 'my_datasets')

# =============================================================================
# Download the dataset from the online repository and load it
# =============================================================================

#x_pristine, y_pristine, dataset_info_pristine, x_vac25, y_vac25, dataset_info_vac25 = load_datasets(dataset_folder)

test_set_name = 'STEM_monocrystalline'
path_to_x_box = os.path.join(dataset_folder, test_set_name + '_x.pkl')
path_to_y_box = os.path.join(dataset_folder, test_set_name + '_y.pkl')
path_to_summary_box = os.path.join(dataset_folder,
                                   test_set_name + '_summary.json')

x_box, y_box, dataset_info_box = load_dataset_from_file(
    path_to_x_box, path_to_y_box, path_to_summary_box)

#print(x_pristine)

# =============================================================================
# Train the convolutional neural network
# =============================================================================

# load the convolutional neural network architecture from Ziletti et al., Nature Communications 9, 2775 (2018)
#partial_model_architecture = partial(cnn_nature_comm_ziletti2018, conv2d_filters=[32, 32, 16, 16, 8, 8],
#                                     kernel_sizes=[3, 3, 3, 3, 3, 3], max_pool_strides=[2, 2],
#                                     hidden_layer_size=128)

# use x_train also for validation - this is only to run the test
#results = train_neural_network(x_train=x_pristine, y_train=y_pristine, x_val=x_pristine, y_val=y_pristine,
#                               configs=configs, partial_model_architecture=partial_model_architecture,
def get_classification_map(polycrystal_file,
                           descriptor,
                           desc_metadata,
                           configs,
                           checkpoint_dir,
                           checkpoint_filename,
                           operations_on_structure=None,
                           stride_size=(4., 4., 4.),
                           box_size=12.0,
                           init_sliding_volume=(14., 14., 14.),
                           desc_file=None,
                           desc_only=False,
                           show_plot_lengths=True,
                           calc_uncertainty=True,
                           mc_samples=10,
                           desc_file_suffix_name='_pristine',
                           nb_jobs=-1,
                           interpolation='none',
                           results_file=None,
                           conf_matrix_file=None,
                           train_set_name='hcp-bcc-sc-diam-fcc-pristine',
                           padding_ratio=None,
                           cmap_uncertainty='hot',
                           interpolation_uncertainty='none'):
    if desc_file is None:
        logger.info("Calculating system's representation.")
        desc_file = calc_polycrystal_desc(
            polycrystal_file,
            stride_size,
            box_size,
            descriptor,
            configs,
            desc_file_suffix_name,
            operations_on_structure,
            nb_jobs,
            show_plot_lengths,
            padding_ratio=padding_ratio,
            init_sliding_volume=init_sliding_volume)
    else:
        logger.info("Using the precomputed user-specified descriptor file.")

    if not desc_only:
        target_list, structure_list = load_descriptor(desc_files=desc_file,
                                                      configs=configs)

        # create dataset
        dataset_name = '{0}_stride_{1}_{2}_{3}_box_size_{4}_{5}.tar.gz'.format(
            polycrystal_file, stride_size[0], stride_size[1], stride_size[2],
            box_size, desc_file_suffix_name)

        path_to_x_test, path_to_y_test, path_to_summary_test = prepare_dataset(
            structure_list=structure_list,
            target_list=target_list,
            desc_metadata=desc_metadata,
            dataset_name=dataset_name,
            target_name='target',
            target_categorical=True,
            input_dims=(52, 32),
            configs=configs,
            dataset_folder=configs['io']['dataset_folder'],
            main_folder=configs['io']['main_folder'],
            desc_folder=configs['io']['desc_folder'],
            tmp_folder=configs['io']['tmp_folder'])

        path_to_x_train = os.path.join(configs['io']['dataset_folder'],
                                       train_set_name + '_x.pkl')
        path_to_y_train = os.path.join(configs['io']['dataset_folder'],
                                       train_set_name + '_y.pkl')
        path_to_summary_train = os.path.join(configs['io']['dataset_folder'],
                                             train_set_name + '_summary.json')

        x_train, y_train, dataset_info_train = load_dataset_from_file(
            path_to_x=path_to_x_train,
            path_to_y=path_to_y_train,
            path_to_summary=path_to_summary_train)

        x_test, y_test, dataset_info_test = load_dataset_from_file(
            path_to_x=path_to_x_test,
            path_to_y=path_to_y_test,
            path_to_summary=path_to_summary_test)

        params_cnn = {
            "nb_classes": dataset_info_train["data"][0]["nb_classes"],
            "classes": dataset_info_train["data"][0]["classes"],
            "batch_size": 32,
            "img_channels": 1
        }

        text_labels = np.asarray(dataset_info_test["data"][0]["text_labels"])
        numerical_labels = np.asarray(
            dataset_info_test["data"][0]["numerical_labels"])

        data_set_predict = make_data_sets(x_train_val=x_test,
                                          y_train_val=y_test,
                                          split_train_val=False,
                                          test_size=0.1,
                                          x_test=x_test,
                                          y_test=y_test)

        target_pred_class, target_pred_probs, prob_predictions, conf_matrix, uncertainty = predict(
            data_set_predict,
            params_cnn["nb_classes"],
            configs=configs,
            batch_size=params_cnn["batch_size"],
            checkpoint_dir=checkpoint_dir,
            checkpoint_filename=checkpoint_filename,
            show_model_acc=False,
            mc_samples=mc_samples,
            predict_probabilities=True,
            plot_conf_matrix=True,
            conf_matrix_file=conf_matrix_file,
            numerical_labels=numerical_labels,
            text_labels=text_labels,
            results_file=results_file,
            normalize=True)

        predictive_mean = prob_predictions

        # get the number of strides in each direction in order to reshape properly
        strided_pattern_positions = []
        for structure in structure_list:
            strided_pattern_positions.append(
                structure.info['strided_pattern_positions'])

        class_plot_pos = np.asarray(strided_pattern_positions)
        (z_max, y_max, x_max) = np.amax(class_plot_pos, axis=0) + 1

        # make a dataframe to order the prob_predictions
        # this is needed when reading from file: the structures are ordered differently after saving
        # this comes into play only if more than 10 values are used in each direction
        df_positions = pd.DataFrame(data=class_plot_pos,
                                    columns=[
                                        'strided_pattern_positions_z',
                                        'strided_pattern_positions_y',
                                        'strided_pattern_positions_x'
                                    ])

        # sort the predictive mean by strided pattern position
        # (join_axes was removed from pd.concat in pandas 1.0; reindex gives the same alignment)
        df_predictive_mean = pd.DataFrame(data=predictive_mean)
        df = pd.concat([df_positions, df_predictive_mean], axis=1)
        df = df.reindex(df_positions.index)
        df_predictive_mean_sorted = df.sort_values(
            ['strided_pattern_positions_z', 'strided_pattern_positions_y',
             'strided_pattern_positions_x'],
            ascending=True)

        predictive_mean_sorted = df_predictive_mean_sorted.drop(columns=[
            'strided_pattern_positions_z', 'strided_pattern_positions_y',
            'strided_pattern_positions_x'
        ]).values

        for idx_class in range(predictive_mean_sorted.shape[1]):

            prob_prediction_class = predictive_mean_sorted[:, idx_class].reshape(
                z_max, y_max, x_max)

            plot_prediction_heatmaps(prob_prediction_class,
                                     title='Probability',
                                     class_name=str(idx_class),
                                     prefix='prob',
                                     main_folder=configs['io']['main_folder'],
                                     cmap='viridis',
                                     interpolation=interpolation)
            # mlab.close(all=True)
            # plt.contour3(prob_prediction_class)  # make a three-dimensional plot

        if calc_uncertainty:
            df_uncertainty = pd.DataFrame()
            for key in uncertainty.keys():
                df_uncertainty[key] = uncertainty[key]

            # join_axes was removed from pd.concat in pandas 1.0; use reindex instead
            df = pd.concat([df_positions, df_uncertainty], axis=1)
            df = df.reindex(df_positions.index)
            df_uncertainty_sorted = df.sort_values(
                ['strided_pattern_positions_z', 'strided_pattern_positions_y',
                 'strided_pattern_positions_x'],
                ascending=True)

            uncertainty_sorted = df_uncertainty_sorted.drop(columns=[
                'strided_pattern_positions_z', 'strided_pattern_positions_y',
                'strided_pattern_positions_x'
            ])

            for key in uncertainty.keys():
                uncertainty_prediction = uncertainty_sorted[key].values.reshape(
                    z_max, y_max, x_max)

                # for idx_uncertainty in range(predictive_mean_sorted.shape[1]):
                plot_prediction_heatmaps(
                    uncertainty_prediction,
                    title='Uncertainty ({})'.format(str(key)),
                    main_folder=configs['io']['main_folder'],
                    cmap=cmap_uncertainty,
                    prefix='uncertainty',
                    suffix=str(key),
                    interpolation=interpolation_uncertainty)
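
Both get_classification_map variants share the same position-sort-and-reshape step before plotting. A toy demonstration of that logic on a shuffled 1 x 2 x 3 grid (self-contained; positions and probabilities are made up):

# toy demonstration of the sort-and-reshape step; z_max == 1, so the result
# is a two-dimensional classification map
import numpy as np
import pandas as pd

cols = ['strided_pattern_positions_z', 'strided_pattern_positions_y',
        'strided_pattern_positions_x']
positions = np.array([[0, 1, 2], [0, 0, 0], [0, 1, 0],
                      [0, 0, 2], [0, 1, 1], [0, 0, 1]])  # arbitrary saved order
prob_class0 = np.array([0.9, 0.1, 0.7, 0.3, 0.8, 0.2])

df = pd.concat([pd.DataFrame(positions, columns=cols),
                pd.DataFrame({'prob': prob_class0})], axis=1)
df_sorted = df.sort_values(cols, ascending=True)

(z_max, y_max, x_max) = positions.max(axis=0) + 1
grid = df_sorted['prob'].values.reshape(y_max, x_max)
print(grid)  # [[0.1 0.2 0.3]
             #  [0.7 0.8 0.9]]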