Code example #1
def preprocess_imgs(input_folder, output_folder, resampling, img_names,
                    out_img_names, normalize_imgs, bias_corrections, fix_adc,
                    options, save_imgs: bool):
    """
    Generates preprocessed images only; this can be used when classifying the test
    dataset of the NN.
    :param input_folder: Path to the DICOM files
    :param output_folder: Path to the output nrrd files
    :param resampling: Array with 3 values indicating the resolution in x, y, z
    :param img_names: Array with the names of the images we want to transform
    :param out_img_names: Output image names
    :param normalize_imgs: Bool array indicating whether to apply percentile normalization to each image
    :param bias_corrections: Bool array indicating whether to apply N4 bias correction to each image
    :param fix_adc: Bool array indicating whether to correct 'black' ADC values in each image
    :param options: Dict of PreprocParams options selecting which preprocessing steps run
    :param save_imgs: Bool indicating whether to save the output images
    :return:
    """
    viz_ob = MedicalImageVisualizer()
    create_folder(output_folder)

    # ******************** READS DATA *******************
    print('\tReading data....')

    [orig_imgs,
     final_img_names] = read_dicom_mri_series(input_folder, img_names,
                                              out_img_names)

    # Saves original images without bias correction
    if save_imgs:
        write_itk_imgs(output_folder, 'img', orig_imgs, final_img_names)

    if len(fix_adc) == len(orig_imgs):
        for idx_fix_adc, c_fix_adc in enumerate(fix_adc):
            if c_fix_adc:
                print(
                    '\tFixing ADC, changing "black" values on original images ....'
                )
                orig_imgs[idx_fix_adc] = correct_adc_itk(
                    orig_imgs[idx_fix_adc])
    else:
        print('\tNo ADC image is being fixed')

    # ************** Normalize images (N4K bias correction) *************
    if options[PreprocParams.bias_correction]:
        print("\tBias correction.....")
        pretxt = 'img_n4k'
        for ii in range(len(orig_imgs)):
            if bias_corrections[ii]:
                # Apply the N4 bias field correction
                orig_imgs[ii] = n4itk(orig_imgs[ii])
        # Saving bias corrected images
        if save_imgs:
            write_itk_imgs(output_folder, pretxt, orig_imgs, final_img_names)

    norm_perc = options[PreprocParams.normalize_percentiles]
    for idx_img in range(len(orig_imgs)):
        if normalize_imgs[idx_img]:
            print(F'\tNormalizing intensities ... {img_names[idx_img]}')
            orig_imgs[idx_img] = normalize_to_percentiles([orig_imgs[idx_img]],
                                                          norm_perc[0],
                                                          norm_perc[1])[0]

    # *********** Resample to [.5,.5,.5] and interpolate with optical flow ******************
    if options[PreprocParams.resample]:
        print("\tResampling .....")
        # viz_obj.plot_img_and_ctrs_itk(orig_imgs[0], slices=SliceMode.MIDDLE, title='Before resampling')
        resampled_imgs, _ = reample_imgs_and_ctrs(orig_imgs, [], resampling)
        # viz_obj.plot_img_and_ctrs_itk(resampled_imgs[0], slices=SliceMode.MIDDLE,title='RESAMPLED')
        if save_imgs:
            write_itk_imgs(output_folder, 'hr', resampled_imgs,
                           final_img_names)

    # *********** Crop and normalize to 0 and 1 ************
    if options[PreprocParams.compute_roi_from_intersection]:
        print("\tCropping.....")
        roi_imgs, _, startROI_final, sizeROI_final = getCroppedIsotropicImgsOZ(
            resampled_imgs, [])

        # Saves the size and start position of the ROI, used when running the model
        np.savetxt(join(output_folder, 'start_ROI.csv'), startROI_final)
        np.savetxt(join(output_folder, 'size_ROI.csv'), sizeROI_final)
        # Save the roi images
        if save_imgs:
            write_itk_imgs(output_folder, 'roi', roi_imgs, final_img_names)

    return orig_imgs, resampled_imgs, roi_imgs
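
A minimal usage sketch for the function above (added for illustration; the paths, series names and percentile values are hypothetical, and only the PreprocParams keys the function actually reads are listed). Note that the final return assumes the resample and compute_roi_from_intersection options are enabled; otherwise resampled_imgs / roi_imgs would be undefined.

example_options = {
    PreprocParams.bias_correction: True,
    PreprocParams.normalize_percentiles: [1, 99],   # hypothetical percentiles
    PreprocParams.resample: True,
    PreprocParams.compute_roi_from_intersection: True,
}
orig, hr, roi = preprocess_imgs(
    input_folder='/data/dicom/case_0001',            # hypothetical path
    output_folder='/data/preproc/case_0001',         # hypothetical path
    resampling=[.5, .5, .5],
    img_names=['tra', 'adc', 'bval'],                # hypothetical series names
    out_img_names=['tra', 'adc', 'bval'],
    normalize_imgs=[True, True, True],
    bias_corrections=[True, False, False],
    fix_adc=[False, True, False],
    options=example_options,
    save_imgs=True)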
Code example #2
def preprocess_imgs_and_ctrs(input_folder, output_folder, resampling,
                             img_names, ctr_names_orig, out_img_names,
                             out_ctr_names_orig, match_whole_w_ctr,
                             normalize_imgs, ctr_folder_names,
                             bias_corrections, fix_adc, options):
    """
    Generates nrrd files from DICOM files, for both contours and series.
    :param input_folder: Path to the DICOM files
    :param output_folder: Path to the output nrrd files
    :param resampling: Array with 3 values indicating the resolution in x, y, z
    :param img_names: Array with the names of the images we want to transform
    :param ctr_names_orig: Array with the names of the contours we want to read
    :param out_img_names: Output image names
    :param out_ctr_names_orig: Output contour names
    :param match_whole_w_ctr: Bool array indicating whether contour names must match exactly or as a RegEx
    :param normalize_imgs: Bool array indicating whether to apply percentile normalization to each image
    :param ctr_folder_names: Str array with the 'folder' names to search for contours
    :param bias_corrections: Bool array indicating whether to apply N4 bias correction to each image
    :param fix_adc: Bool array indicating whether to correct 'black' ADC values in each image
    :param options: Dict of PreprocParams options selecting which preprocessing steps run
    :return:
    """
    viz_obj = MedicalImageVisualizer()

    ctr_names = ctr_names_orig.copy(
    )  # Patch to avoid problems with global variable
    out_ctr_names = out_ctr_names_orig.copy(
    )  # Patch to avoid problems with global variable

    create_folder(output_folder)

    # ******************** READS DATA *******************
    print('\tReading data....')

    [orig_imgs,
     final_img_names] = read_dicom_mri_series(input_folder, img_names,
                                              out_img_names)

    [orig_ctrs, final_ctr_names
     ] = read_rtstruct_mri_series(input_folder,
                                  ctr_folder_names=ctr_folder_names,
                                  in_ctr_names=ctr_names,
                                  out_ctr_names=out_ctr_names,
                                  ref_img_itk=orig_imgs[0],
                                  match_whole_word=match_whole_w_ctr)

    # Saves original images without bias correction
    write_itk_imgs(output_folder, 'img', orig_imgs, final_img_names)
    write_itk_imgs(output_folder, 'ctr', orig_ctrs, final_ctr_names)

    # ************** Correcting ADC intensities  *************
    for idx_fix_adc, c_fix_adc in enumerate(fix_adc):
        if c_fix_adc:
            print(
                '\tFixing ADC, changing "black" values on original images ....'
            )
            orig_imgs[idx_fix_adc] = correct_adc_itk(orig_imgs[idx_fix_adc])

    # ************** Normalize images (N4K bias correction) *************
    if options[PreprocParams.bias_correction]:
        print("\tBias correction.....")
        pretxt = 'img_n4k'
        for ii in range(len(orig_imgs)):
            if bias_corrections[ii]:
                # First try to read an existing file, if not, compute it
                file_name = join(
                    output_folder, '{}_{}.nrrd'.format(pretxt,
                                                       final_img_names[ii]))
                if exists(file_name):
                    print('\t\tReading previous n4k file...')
                    orig_imgs[ii] = sitk.ReadImage(file_name)
                else:
                    orig_imgs[ii] = n4itk(orig_imgs[ii])
        # Saving bias corrected images
        write_itk_imgs(output_folder, pretxt, orig_imgs, final_img_names)

    norm_perc = options[PreprocParams.normalize_percentiles]
    for idx_img in range(len(orig_imgs)):
        if normalize_imgs[idx_img]:
            print(F'\tNormalizing intensities ... {img_names[idx_img]}')
            orig_imgs[idx_img] = normalize_to_percentiles([orig_imgs[idx_img]],
                                                          norm_perc[0],
                                                          norm_perc[1])[0]

    # *********** Resample to [.5,.5,.5] and interpolate with optical flow ******************
    if options[PreprocParams.resample]:
        print("\tResampling .....")
        # viz_obj.plot_img_and_ctrs_itk(orig_imgs[0], orig_ctrs, slices=SliceMode.MIDDLE, title='Before resampling')
        resampled_imgs, resampled_ctrs = reample_imgs_and_ctrs(
            orig_imgs, orig_ctrs, resampling)
        # viz_obj.plot_img_and_ctrs_itk(resampled_imgs[0], resampled_ctrs, slices=SliceMode.MIDDLE,title='RESAMPLED')
        if options[PreprocParams.optical_flow_ctr_interpolation]:
            print('\t\tOptical flow ....')
            resampled_ctrs = optical_flow_interpolation(resampled_ctrs)
        write_itk_imgs(output_folder, 'hr', resampled_imgs, final_img_names)
        write_itk_imgs(output_folder, 'hr_ctr', resampled_ctrs,
                       final_ctr_names)

    # *********** Crop and normalize to 0 and 1 ************
    if options[PreprocParams.compute_roi_from_intersection]:
        print("\tCropping.....")
        roi_imgs, roi_ctrs, startROI_final, sizeROI_final = getCroppedIsotropicImgsOZ(
            resampled_imgs, resampled_ctrs)

        if options[PreprocParams.smooth_ctrs]:
            print("\t\tSmoothing ctrs.....")
            # viz_obj.plot_img_and_ctrs_itk(roi_ctrs[0], slices=SliceMode.MIDDLE,title='Before smoothing')
            roi_ctrs = smoothContours(roi_ctrs)
            # viz_obj.plot_img_and_ctrs_itk(roi_ctrs[0], slices=SliceMode.MIDDLE,title='After smoothing')

        # Saves the size and start position of the ROI, used when running the model
        np.savetxt(join(output_folder, 'start_ROI.csv'), startROI_final)
        np.savetxt(join(output_folder, 'size_ROI.csv'), sizeROI_final)
        # Save the roi images
        write_itk_imgs(output_folder, 'roi', roi_imgs, final_img_names)
        write_itk_imgs(output_folder, 'roi_ctr', roi_ctrs, final_ctr_names)

    # viz_obj.plot_imgs_and_ctrs_itk(roi_imgs, roi_ctrs, slices=SliceMode.MIDDLE, title='Final ROIs')
    print("DONE!!!!...")
Code example #3
    loss_func = config[TrainingParams.loss_function]
    batch_size = config[TrainingParams.batch_size]
    epochs = config[TrainingParams.epochs]
    img_names = config[TrainingParams.image_file_names]
    model_name_user = config[TrainingParams.config_name]
    class_label_file_name = config[TrainingParams.class_label_file_name]
    optimizer = config[TrainingParams.optimizer]

    nn_input_size = config[ModelParams.INPUT_SIZE]
    model_type = config[ModelParams.MODEL]

    split_info_folder = join(output_folder, 'Splits')
    parameters_folder = join(output_folder, 'Parameters')
    weights_folder = join(output_folder, 'models')
    logs_folder = join(output_folder, 'logs')
    create_folder(split_info_folder)
    create_folder(parameters_folder)
    create_folder(weights_folder)
    create_folder(logs_folder)

    folders_to_read = select_cases_from_folder(input_folder,
                                               config[TrainingParams.cases])
    tot_examples = len(folders_to_read)

    # ================ Split definition =================
    [train_ids, val_ids, test_ids
     ] = utilsNN.split_train_validation_and_test(tot_examples,
                                                 val_percentage=val_perc,
                                                 test_percentage=test_perc)

    print("Train examples (total:{}) :{}".format(len(train_ids),
Code example #4
def main():
    config = get_makeprediction_config()
    # *********** Reads the parameters ***********

    input_file = config[ClassificationParams.input_file]
    splits_file = config[ClassificationParams.split_file]
    output_folder = config[ClassificationParams.output_folder]
    output_imgs_folder = config[ClassificationParams.output_imgs_folder]
    output_file_name = config[ClassificationParams.output_file_name]
    run_name = config[TrainingParams.config_name]
    model_weights_file = config[ClassificationParams.model_weights_file]
    forecasted_hours = config[LocalTrainingParams.forecasted_hours]
    disp_images = config[ClassificationParams.show_imgs]
    generate_images = config[ClassificationParams.generate_images]
    metrics_user = config[ClassificationParams.metrics]
    filter_stations = config[LocalTrainingParams.stations]

    # Basic sanity checks: the model weights file and the input file must be set
    assert len(model_weights_file) > 0
    assert len(input_file) > 0

    print(F"Working with: {model_weights_file} \n and \n {input_file}")

    data = pd.read_csv(input_file, index_col=0, parse_dates=True)

    all_data_cols = data.columns
    date_columns = [
        x for x in all_data_cols if (x.find('week') != -1) or (
            x.find('hour') != -1) or (x.find('year') != -1)
    ]
    stations_columns = [
        x for x in all_data_cols
        if (x.find('h') == -1) and (x not in date_columns)
    ]
    meteo_columns = [
        x for x in all_data_cols if (x.find('h') != -1) and (
            x not in date_columns) and (x not in stations_columns)
    ]
    desired_columns = meteo_columns + filter_stations + date_columns

    print("Appending date hot vector...")
    date_hv = generate_date_hot_vector(data.index)
    data = pd.concat([data[desired_columns], date_hv], axis=1)
    print("Done!")

    # print("Filtering data to hours 9 to 20...")
    filtered_data = data.between_time("9:00", "20:00")
    # filtered_data = data
    datetimes_str = filtered_data.index.values
    # print("Done!")

    print(F'Normalizing and filtering data....')
    parameters_folder = join(dirname(output_folder), 'Training', 'Parameters')
    data_norm_df_final, accepted_times_idx, y_times_idx, stations_columns, meteo_columns = \
        normalizeAndFilterData(filtered_data, datetimes_str, forecasted_hours, output_folder=parameters_folder,
                               run_name=run_name, read_from_file=True)

    # ********* Filling nan values in the stations with the mean values of all the 'available' stations ********
    X_df = data_norm_df_final.loc[datetimes_str[accepted_times_idx]]
    Y_df = data_norm_df_final.loc[datetimes_str[y_times_idx]][stations_columns]

    # ********* Filling nan values in the stations with the mean values of all the 'available' stations ********
    # for cur_station in stations_columns:
    #     X_df[cur_station] = X_df[cur_station].fillna(X_df['MEAN'])
    #     Y_df[cur_station] = Y_df[cur_station].fillna(data_norm_df_final.loc[datetimes_str[y_times_idx]]['MEAN'])

    # X = data_norm_df_final.loc[datetimes_str[accepted_times_idx]].values
    # X_df = X_df.drop(columns=['MEAN'])
    X_df = X_df.drop(columns=stations_columns)
    X = X_df.values
    # Y = data_norm_df_final.loc[datetimes_str[y_times_idx]][stations_columns].values
    Y = Y_df.values

    config[ModelParams.INPUT_SIZE] = len(X_df.columns)
    print(F'X shape: {X.shape} Y shape: {Y.shape}')

    # *********** Chooses the proper model ***********
    print('Reading model ....')
    config[ModelParams.NUMBER_OF_OUTPUT_CLASSES] = Y.shape[1]
    model = select_1d_model(config)

    # *********** Reads the splits information ***********
    print('Reading splits info....')
    if splits_file != '':  # In this case we do read the information
        split_info = pd.read_csv(splits_file, dtype=np.int16)
    else:
        split_info = pd.DataFrame({
            'train_ids': [],
            'validation_ids': [],
            'test_id': []
        })
        split_info['train_ids'] = range(Y.shape[0])

    # *********** Reads the weights***********
    print('Reading weights ....')
    model.load_weights(model_weights_file)

    # ************ Makes NN Prediction ********
    print('Making prediction ....')
    output_nn_all = model.predict(X, verbose=1)

    # ************ Saves raw results ********
    number_of_examples = 10
    if generate_images:
        img_viz = EOAImageVisualizer(output_folder=output_imgs_folder,
                                     disp_images=disp_images)

        Y[Y == -1] = np.nan  # So that we do not show the -1
        for c_example in range(number_of_examples):
            hours_to_plot = 24 * 3  # How many points to plot
            start_idx = np.random.randint(
                0, X.shape[0] - hours_to_plot - forecasted_hours)
            end_idx = start_idx + hours_to_plot
            create_folder(output_folder)
            create_folder(output_imgs_folder)
            for idx_station, cur_station in enumerate(filter_stations):
                img_viz.plot_1d_data_np(
                    datetimes_str[y_times_idx][start_idx:end_idx], [
                        Y[start_idx:end_idx, idx_station],
                        output_nn_all[start_idx:end_idx, idx_station]
                    ],
                    title=F'{cur_station}',
                    labels=['GT', 'NN'],
                    file_name_prefix=F'{cur_station}_{c_example}')

    # ************ Recovering original units********
    print('Recovering original units....')
    nn_df = pd.DataFrame(output_nn_all,
                         columns=stations_columns,
                         index=filtered_data.index[y_times_idx])
    nn_original_units = deNormalize(nn_df)
    Y_original = deNormalize(Y_df)

    # ************ Computing metrics********
    print('Computing metrics and saving predictions....')
    compute_metrics(Y_original, nn_original_units, metrics_user, split_info,
                    output_file_name, stations_columns)
Code example #5
def make_3d_segmentation(config):
    """
    :param config:
    :return:
    """

    # *********** Reads the parameters ***********

    cases = config[ClassificationParams.cases]
    save_segmented_ctrs = config[ClassificationParams.save_segmented_ctrs]

    input_folder = config[ClassificationParams.input_folder]
    input_img_names = config[ClassificationParams.input_img_file_names]
    output_folder = config[ClassificationParams.output_folder]
    output_imgs_folder = config[ClassificationParams.output_imgs_folder]
    output_file_name = config[ClassificationParams.output_file_name]
    model_weights_file = config[ClassificationParams.model_weights_file]
    compute_metrics = config[ClassificationParams.compute_metrics]
    compute_original_resolution = config[
        ClassificationParams.compute_original_resolution]

    save_imgs = config[ClassificationParams.save_imgs]
    if save_imgs:
        save_imgs_planes = config[ClassificationParams.save_img_planes]
        save_imgs_slices = config[ClassificationParams.save_img_slices]

    # Builds the visualization object
    viz_obj = MedicalImageVisualizer(
        disp_images=config[ClassificationParams.show_imgs],
        output_folder=output_imgs_folder)

    if compute_metrics:
        output_ctr_file_names = config[
            ClassificationParams.output_ctr_file_names]
    else:
        output_ctr_file_names = []
    # *********** Chooses the proper model ***********
    print('Reading model ....')
    model = select_3d_model(config)

    # *********** Reads the weights***********
    print('Reading weights ....')
    model.load_weights(model_weights_file)

    examples = select_cases_from_folder(input_folder, cases)
    create_folder(output_imgs_folder)

    # *********** Makes a dataframe to contain the DSC information **********
    metrics_params = config[ClassificationParams.metrics]
    metrics_dict = {met.name: met.value for met in metrics_params}

    # Check if the output files already exist; in that case read the df from it.
    if os.path.exists(join(output_imgs_folder, output_file_name)):
        data = pd.read_csv(join(output_imgs_folder, output_file_name),
                           index_col=0)
    else:
        data_columns = list(metrics_dict.values())
        if compute_original_resolution:
            # In this case we add all the desired metrics, but append 'original' at the beginning
            data_columns = {
                *data_columns,
                *[F'{ORIGINAL_TXT}_{col}' for col in data_columns]
            }
        data = DataFrame(index=examples, columns=data_columns)

    # *********** Iterates over each case *********
    segmentation_type = config[ClassificationParams.segmentation_type]
    for id_folder, current_folder in enumerate(examples):
        print(F'******* Computing folder {current_folder} ************')
        t0 = time.time()
        try:
            # -------------------- Reading data -------------
            print('\t Reading data....')
            # All these names are predefined, for any other 3d segmentation we will need to create a different configuration
            imgs_itk, ctrs_itk, size_roi, start_roi, _ = read_preproc_imgs_and_ctrs_itk(
                input_folder,
                folders_to_read=[current_folder],
                img_names=input_img_names,
                ctr_names=output_ctr_file_names)

            imgs_np = [sitk.GetArrayFromImage(c_img) for c_img in imgs_itk[0]
                       ]  # The 0 is because we read a single folder
            ctrs_np = [sitk.GetArrayFromImage(c_img) for c_img in ctrs_itk[0]]

            # If we want to visualize the input images
            # viz_obj.plot_imgs_and_ctrs_itk(imgs_itk[0], ctrs_itk=ctrs_itk[0])

            # ------------------- Making prediction -----------
            print('\t Making prediction....')
            input_array = format_for_nn_classification(imgs_np)
            output_nn_all = model.predict(input_array, verbose=1)
            output_nn_np = output_nn_all[0, :, :, :, 0]
            # For visualizing the output of the network
            # viz_obj.plot_img_and_ctrs_np(img_np=output_nn_np)

            # ------------------- Postprocessing -----------
            print('\t Postprocessing prediction....')
            threshold = .5
            output_nn_itk = copyItkImage(imgs_itk[0][0], output_nn_np)
            print(F'\t\t Threshold NN output to {threshold} ....')
            output_nn_itk = binaryThresholdImage(output_nn_itk, threshold)
            if segmentation_type == SegmentationTypes.PROSTATE or segmentation_type == SegmentationTypes.PZ:
                print(
                    F'\t\t Restricting to largest connected component only  ....'
                )
                output_nn_itk = getLargestConnectedComponents(output_nn_itk)
                output_nn_np = sitk.GetArrayViewFromImage(output_nn_itk)

            if compute_original_resolution:
                print('\t Recovering original resolution...')
                print('\t\t Reading original resolution images....')
                img_names = [
                    config[
                        ClassificationParams.resampled_resolution_image_name],
                    config[ClassificationParams.original_resolution_image_name]
                ]
                ctr_name = config[
                    ClassificationParams.original_resolution_ctr_name]
                imgs_itk_original_temp, ctrs_itk_original_temp, _, _, _ = read_preproc_imgs_and_ctrs_itk(
                    input_folder,
                    folders_to_read=[current_folder],
                    img_names=img_names,
                    ctr_names=[ctr_name])

                gt_ctr_original_itk = ctrs_itk_original_temp[0][
                    0]  # Retrieves the gt ctr at the original resolution
                img_original_resampled_itk = imgs_itk_original_temp[0][0]
                img_original_itk = imgs_itk_original_temp[0][1]
                print('\t\t Resampling to original....')
                output_nn_original_itk = recover_original_resolution(
                    roi_np=output_nn_np,
                    resampled_itk=img_original_resampled_itk,
                    original_itk=img_original_itk,
                    start_positions=start_roi[0],
                    size_roi=size_roi[0])
                output_nn_original_itk = binaryThresholdImage(
                    output_nn_original_itk, threshold)
                if segmentation_type == SegmentationTypes.PROSTATE or segmentation_type == SegmentationTypes.PZ:
                    print(
                        F'\t\t\t Restricting to largest connected component only  ....'
                    )
                    output_nn_original_itk = getLargestConnectedComponents(
                        output_nn_original_itk)
                    output_nn_original_np = sitk.GetArrayViewFromImage(
                        output_nn_original_itk)

            if save_segmented_ctrs:
                print('\t Saving Prediction...')
                create_folder(join(output_folder, current_folder))
                # TODO at some point we will need to see if we can output more than one ctr
                sitk.WriteImage(
                    output_nn_itk,
                    join(output_folder, current_folder,
                         output_ctr_file_names[0]))
                if compute_original_resolution:
                    sitk.WriteImage(
                        output_nn_original_itk,
                        join(output_folder, current_folder,
                             F'{ORIGINAL_TXT}_{output_ctr_file_names[0]}'))

            if compute_metrics:
                # Compute metrics
                print('\t Computing metrics....')
                for c_metric in metrics_params:  # Here we can add more metrics
                    if c_metric == ClassificationMetrics.DSC_3D:
                        metric_value = numpy_dice(output_nn_np, ctrs_np[0])
                        data.loc[current_folder][c_metric.value] = metric_value
                        print(F'\t\t ----- DSC: {metric_value:.3f} -----')
                        if compute_original_resolution:
                            metric_value = numpy_dice(
                                output_nn_original_np,
                                sitk.GetArrayViewFromImage(
                                    gt_ctr_original_itk))
                            data.loc[current_folder][
                                F'{ORIGINAL_TXT}_{c_metric.value}'] = metric_value
                            print(F'\t\t ----- DSC: {metric_value:.3f} -----')

                # Saving the results every 10 steps
                if id_folder % 10 == 0:
                    save_metrics_images(data,
                                        metric_names=list(
                                            metrics_dict.values()),
                                        viz_obj=viz_obj)
                    data.to_csv(join(output_folder, output_file_name))

            if save_imgs:
                print('\t Plotting images....')
                plot_intermediate_results(current_folder,
                                          data_columns,
                                          imgs_itk=imgs_itk[0],
                                          gt_ctr_itk=ctrs_itk[0][0],
                                          nn_ctr_itk=output_nn_itk,
                                          data=data,
                                          viz_obj=viz_obj,
                                          slices=save_imgs_slices,
                                          compute_metrics=compute_metrics)
                if compute_original_resolution:
                    plot_intermediate_results(
                        current_folder,
                        data_columns,
                        imgs_itk=[img_original_itk],
                        gt_ctr_itk=gt_ctr_original_itk,
                        nn_ctr_itk=output_nn_original_itk,
                        data=data,
                        viz_obj=viz_obj,
                        slices=save_imgs_slices,
                        compute_metrics=compute_metrics,
                        prefix_name=ORIGINAL_TXT)
        except Exception as e:
            print(
                "---------------------------- Failed {} error: {} ----------------"
                .format(current_folder, e))
        print(F'\t Done! Elapsed time {time.time()-t0:0.2f} sec')

    if compute_metrics:
        save_metrics_images(data,
                            metric_names=list(metrics_dict.values()),
                            viz_obj=viz_obj)
        data.to_csv(join(output_folder, output_file_name))
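
A minimal driver sketch (an assumption, not from the original source); the config loader name is hypothetical and mirrors get_makeprediction_config / get_segmentation_2d_config used in the other examples:

if __name__ == '__main__':
    config = get_segmentation_3d_config()   # hypothetical config loader
    make_3d_segmentation(config)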
Code example #6
def main():
    config = getTrainingParams()
    # =============== Read data and merge meteorological variables===============
    print("Reading data")
    pollutant = config[LocalTrainingParams.pollutant]

    input_folder = config[TrainingParams.input_folder]
    output_folder = config[TrainingParams.output_folder]

    val_perc = config[TrainingParams.validation_percentage]
    test_perc = config[TrainingParams.test_percentage]
    eval_metrics = config[TrainingParams.evaluation_metrics]
    loss_func = config[TrainingParams.loss_function]
    batch_size = config[TrainingParams.batch_size]
    epochs = config[TrainingParams.epochs]
    model_name_user = config[TrainingParams.config_name]
    optimizer = config[TrainingParams.optimizer]
    forecasted_hours = config[LocalTrainingParams.forecasted_hours]
    years = config[LocalTrainingParams.years]
    debugging = config[LocalTrainingParams.debug]
    filter_stations = config[LocalTrainingParams.stations]
    filter_dates = config[LocalTrainingParams.filter_dates]

    split_info_folder = join(output_folder, 'Splits')
    parameters_folder = join(output_folder, 'Parameters')
    weights_folder = join(output_folder, 'models')
    logs_folder = join(output_folder, 'logs')
    create_folder(split_info_folder)
    create_folder(parameters_folder)
    create_folder(weights_folder)
    create_folder(logs_folder)


    data = None
    for year in years:
        print(F"============ Reading data for {year}: {pollutant} -- AllStations ==========================")
        if debugging:
            db_file_name = join(input_folder, F"{year}_{pollutant}_AllStationsDebug.csv")
        else:
            db_file_name = join(input_folder, F"{year}_{pollutant}_AllStations.csv")

        temp = pd.read_csv(db_file_name, index_col=0, parse_dates=True)
        if data is None:
            all_data_cols = temp.columns
            date_columns = [x for x in all_data_cols if (x.find('week') != -1) or (x.find('hour') != -1) or (x.find('year') != -1)]
            stations_columns = [x for x in all_data_cols if (x.find('h') == -1) and (x not in date_columns)]
            meteo_columns = [x for x in all_data_cols if (x.find('h') != -1) and (x not in date_columns) and (x not in stations_columns)]
            desired_columns = meteo_columns + filter_stations + date_columns
            data = temp[desired_columns]
        else:
            data = pd.concat([data, temp[desired_columns]])

    print("Appending date hot vector...")
    date_hv = generate_date_hot_vector(data.index)
    data = pd.concat([data, date_hv], axis=1)
    print("Done!")

    # ********** Restricting only data between the hours of 9 to 20 TODO hardcoded *****
    if filter_dates:
        filtered_data = data.between_time("9:00", "20:00")
    else:
        filtered_data = data
    datetimes_str = filtered_data.index.values

    data_norm_df_final, accepted_times_idx, y_times_idx, stations_columns, meteo_columns =\
        normalizeAndFilterData(filtered_data, datetimes_str, forecasted_hours, output_folder=parameters_folder,
                               run_name=model_name_user, read_from_file=False)

    X_df = data_norm_df_final.loc[datetimes_str[accepted_times_idx]]
    Y_df = data_norm_df_final.loc[datetimes_str[y_times_idx]][stations_columns]

    # ********* Filling nan values in the stations with the mean values of all the 'available' stations ********
    # for cur_station in stations_columns:
    #     X_df[cur_station] = X_df[cur_station].fillna(X_df['MEAN'])
    #     Y_df[cur_station] = Y_df[cur_station].fillna(data_norm_df_final.loc[datetimes_str[y_times_idx]]['MEAN'])

    # X = data_norm_df_final.loc[datetimes_str[accepted_times_idx]].values
    # X_df = X_df.drop(columns=['MEAN'])
    X_df = X_df.drop(columns=stations_columns)
    X = X_df.values
    # Y = data_norm_df_final.loc[datetimes_str[y_times_idx]][stations_columns].values
    Y = Y_df.values

    # ****** Bootstrap everything above 60 ppts TODO hardcoded
    idx_by_col = Y_df > 0.24
    idx_above = idx_by_col.any(axis=1)
    # bootstrap_size = 5  # How many times are we repeating the 'high' values
    # for i in range(bootstrap_size):
    Y = np.append(Y, Y[idx_above, :], axis=0)
    X = np.append(X, X[idx_above, :], axis=0)
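    # The two appends above duplicate (once) every row whose normalized target exceeds
    # the threshold, oversampling high-pollution examples before the train/val/test split.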

    config[ModelParams.INPUT_SIZE] = len(X_df.columns)
    print(F'Data shape: {filtered_data.shape} Data axes {filtered_data.axes}')
    print(F'X shape: {X.shape} Y shape: {Y.shape}')

    tot_examples = X.shape[0]
    rows_to_read = np.arange(tot_examples)

    # ================ Split definition =================
    [train_ids, val_ids, test_ids] = utilsNN.split_train_validation_and_test(tot_examples,
                                                                             val_percentage=val_perc,
                                                                             test_percentage=test_perc)

    print("Train examples (total:{}) :{}".format(len(train_ids), rows_to_read[train_ids]))
    print("Validation examples (total:{}) :{}:".format(len(val_ids), rows_to_read[val_ids]))
    print("Test examples (total:{}) :{}".format(len(test_ids), rows_to_read[test_ids]))

    print("Selecting and generating the model....")
    now = datetime.utcnow().strftime("%Y_%m_%d_%H_%M")
    model_name = F'{model_name_user}_{now}_{pollutant}_AllStations'

    # ******************* Selecting the model **********************
    config[ModelParams.NUMBER_OF_OUTPUT_CLASSES] = Y.shape[1]
    print(F"Nomber of output variables {Y.shape[1]}")
    model = select_1d_model(config)
    plot_model(model, to_file=join(output_folder, F'{model_name}.png'), show_shapes=True)

    file_name_splits = join(split_info_folder, F'{model_name}.csv')
    utilsNN.save_splits(file_name_splits, train_ids, val_ids, test_ids)

    print("Getting callbacks ...")

    [logger, save_callback, stop_callback] = utilsNN.get_all_callbacks(model_name=model_name,
                                                                       early_stopping_func=F'val_{eval_metrics[0].__name__}',
                                                                       weights_folder=weights_folder,
                                                                       logs_folder=logs_folder)

    print("Compiling model ...")
    model.compile(loss=loss_func, optimizer=optimizer, metrics=eval_metrics)

    print("Training ...")
    # This part should be somehow separated, it will change for every project
    x_train = X[train_ids, :]
    y_train = Y[train_ids, :]
    x_val = X[val_ids, :]
    y_val = Y[val_ids, :]

    model.fit(x_train, y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_data=(x_val, y_val),
              shuffle=True,
              callbacks=[logger, save_callback, stop_callback])
Code example #7
def trainModel(config, cur_pollutant, cur_station):
    """Trying to separate things so that tf 'cleans' the memory """

    input_folder = config[TrainingParams.input_folder]
    output_folder = config[TrainingParams.output_folder]

    val_perc = config[TrainingParams.validation_percentage]
    test_perc = config[TrainingParams.test_percentage]
    eval_metrics = config[TrainingParams.evaluation_metrics]
    loss_func = config[TrainingParams.loss_function]
    batch_size = config[TrainingParams.batch_size]
    epochs = config[TrainingParams.epochs]
    model_name_user = config[TrainingParams.config_name]
    optimizer = config[TrainingParams.optimizer]
    forecasted_hours = config[LocalTrainingParams.forecasted_hours]

    split_info_folder = join(output_folder, 'Splits')
    parameters_folder = join(output_folder, 'Parameters')
    weights_folder = join(output_folder, 'models')
    logs_folder = join(output_folder, 'logs')
    imgs_folder = join(output_folder, 'imgs')
    create_folder(split_info_folder)
    create_folder(parameters_folder)
    create_folder(weights_folder)
    create_folder(logs_folder)

    viz_obj = EOAImageVisualizer(output_folder=imgs_folder, disp_images=False)

    print(
        F"============ Reading data for: {cur_pollutant} -- {cur_station} =========================="
    )
    db_file_name = join(input_folder, constants.merge_output_folder.value,
                        F"{cur_pollutant}_{cur_station}.csv")
    data = pd.read_csv(db_file_name, index_col=0)

    config[ModelParams.INPUT_SIZE] = len(data.columns)
    print(F'Data shape: {data.shape} Data axes {data.axes}')
    print("Done!")

    # Predicting for the next value after 24hrs (only one)
    print("Normalizing data....")
    datetimes_str = data.index.values
    datetimes = np.array([
        datetime.strptime(x, constants.datetime_format.value)
        for x in datetimes_str
    ])

    scaler = preprocessing.MinMaxScaler()
    scaler = scaler.fit(data)
    data_norm_np = scaler.transform(data)
    data_norm_df = DataFrame(data_norm_np,
                             columns=data.columns,
                             index=data.index)
    print(F'Done!')

    # Filtering only dates where there is data "forecasted hours after" (24 hrs after)
    print(F"\tBuilding X and Y ....")
    accepted_times_idx = []
    y_times_idx = []
    for i, c_datetime in enumerate(datetimes):
        forecasted_datetime = (c_datetime + timedelta(hours=forecasted_hours))
        if forecasted_datetime in datetimes:
            accepted_times_idx.append(i)
            y_times_idx.append(
                np.argwhere(forecasted_datetime == datetimes)[0][0])

    X_df = data_norm_df.loc[datetimes_str[accepted_times_idx]]
    Y_df = data_norm_df.loc[datetimes_str[y_times_idx]][cur_pollutant]
    X = X_df.values
    Y = Y_df.values

    print(F'X shape: {X.shape} Y shape: {Y.shape}')

    tot_examples = X.shape[0]
    rows_to_read = np.arange(tot_examples)

    # ================ Split definition =================
    [train_ids, val_ids, test_ids
     ] = utilsNN.split_train_validation_and_test(tot_examples,
                                                 val_percentage=val_perc,
                                                 test_percentage=test_perc)

    print("Train examples (total:{}) :{}".format(len(train_ids),
                                                 rows_to_read[train_ids]))
    print("Validation examples (total:{}) :{}:".format(len(val_ids),
                                                       rows_to_read[val_ids]))
    print("Test examples (total:{}) :{}".format(len(test_ids),
                                                rows_to_read[test_ids]))

    print("Selecting and generating the model....")
    now = datetime.utcnow().strftime("%Y_%m_%d_%H_%M")
    model_name = F'{model_name_user}_{now}_{cur_pollutant}_{cur_station}'

    # ******************* Selecting the model **********************
    model = select_1d_model(config)
    plot_model(model,
               to_file=join(output_folder, F'{model_name}.png'),
               show_shapes=True)

    print("Saving split information...")
    file_name_splits = join(split_info_folder, F'{model_name}.csv')
    info_splits = DataFrame({F'Train({len(train_ids)})': train_ids})
    info_splits[F'Validation({len(val_ids)})'] = 0
    info_splits[F'Validation({len(val_ids)})'][0:len(val_ids)] = val_ids
    info_splits[F'Test({len(test_ids)})'] = 0
    info_splits[F'Test({len(test_ids)})'][0:len(test_ids)] = test_ids
    info_splits.to_csv(file_name_splits, index=None)

    print(F"Norm params: {scaler.get_params()}")
    file_name_normparams = join(parameters_folder, F'{model_name}.txt')
    utilsNN.save_norm_params(file_name_normparams, NormParams.min_max, scaler)
    info_splits.to_csv(file_name_splits, index=None)

    print("Getting callbacks ...")

    [logger, save_callback, stop_callback] = utilsNN.get_all_callbacks(
        model_name=model_name,
        early_stopping_func=F'val_{eval_metrics[0].__name__}',
        weights_folder=weights_folder,
        logs_folder=logs_folder)

    print("Compiling model ...")
    model.compile(loss=loss_func, optimizer=optimizer, metrics=eval_metrics)

    print("Training ...")
    # This part should be somehow separated, it will change for every project
    x_train = X[train_ids, :]
    y_train = Y[train_ids]
    x_val = X[val_ids, :]
    y_val = Y[val_ids]
    x_test = X[test_ids, :]
    y_test = Y[test_ids]

    # Plotting some intermediate results
    import matplotlib.pyplot as plt
    size = 24 * 60  # Two months of data
    start = np.random.randint(0, len(data) - size)
    end = start + size
    plt.figure(figsize=[64, 8])
    x_plot = range(len(X_df.iloc[start:end].index.values))
    y_plot = X_df.iloc[start:end][cur_pollutant].values
    yy_plot = Y_df.iloc[start:end].values
    viz_obj.plot_1d_data_np(x_plot, [y_plot, yy_plot],
                            title=F"{cur_pollutant}_{cur_station}",
                            labels=['Current', 'Desired'],
                            wide_ratio=4,
                            file_name_prefix=F"{cur_pollutant}_{cur_station}")

    model.fit(x_train,
              y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_data=(x_val, y_val),
              shuffle=True,
              callbacks=[logger, save_callback, stop_callback])
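
A hypothetical driver loop (an assumption, not from the original source): train one model per (pollutant, station) pair so TensorFlow can release memory between calls, as the docstring suggests. The pollutant identifier and station codes are illustrative only:

if __name__ == '__main__':
    config = getTrainingParams()
    for cur_pollutant in ['O3']:             # hypothetical pollutant identifier
        for cur_station in ['MER', 'PED']:   # illustrative station codes
            trainModel(config, cur_pollutant, cur_station)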
Code example #8
    def organize_folders(self):
        '''
        This is the main function that organizes the folders.
        :return:
        '''
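        # LUT maps each patient id (folder-name prefix + study date) to its anonymized
        # destination folder name and its running numeric index.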
        LUT = {}
        original_date_folders = os.listdir(self._src_folder)
        original_date_folders.sort()
        last_idx = self._start_idx
        prev_file_found = False

        create_folder(self._dst_folder)

        if self._search_previous_names:
            LUT, prev_csv_file, last_idx, prev_file_found = self.read_prev_lut_file()

        # Iterate over all the source folders (dates of MIM)
        for c_date_folder in original_date_folders:
            print('******************************* {} *************************************'.format(c_date_folder))
            all_patients_in_date = os.listdir(join(self._src_folder, c_date_folder))
            only_date = c_date_folder.replace('__Studies', '')

            # Iterate over the folders we are searching for (a list for each image type)
            for idx_kf, keep_folder in enumerate(self._orig_folder_names):
                isNotDCE = self._new_folder_names[idx_kf] != 'DCE'

                # Iterate the hierarchy of this folder (when we are looking for multiple folders)
                for c_keep_folder in keep_folder:
                    # Search the folders that match by hierarchy
                    matched_folders = [x for x in all_patients_in_date if not (re.search(c_keep_folder, x) is None)]

                    # Iterate over matched folders
                    for c_folder in matched_folders:
                        pid = '{}_{}'.format(c_folder[0:self._replace_chars], only_date)  # Get patient id
                        if pid in LUT:  # Check whether we have already 'used' this patient
                            cidx = LUT[pid]['id']
                            if self._keep_original_names:
                                curr_dst_folder = c_folder[0:self._replace_chars]
                            else:
                                curr_dst_folder = join('%s-%04d' % (self._prefix_name, cidx))

                            # Patch for DCE
                            if isNotDCE:
                                # Assure this folder is NOT already there
                                if os.path.exists(join(self._dst_folder, curr_dst_folder)):
                                    check_folders = os.listdir(join(self._dst_folder, curr_dst_folder))
                                    if np.any([x.find(self._new_folder_names[idx_kf]) != -1 for x in check_folders]):
                                        continue  # In this case we matched a LOWER level folder (in the hierarchy)

                        else:  # pid is not in LUT
                            if self._keep_original_names:
                                curr_dst_folder = c_folder[0:self._replace_chars]
                            else:
                                curr_dst_folder = join('%s-%04d' % (self._prefix_name, last_idx))
                            LUT[pid] = {'Folder': join('%s-%04d' % (self._prefix_name, last_idx)), 'id': last_idx}
                            last_idx += 1

                        curr_patient = c_folder[self._replace_chars:]
                        # Take into account only the folders in 'self._orig_folder_names'
                        src = join(self._src_folder, c_date_folder, c_folder)
                        dst = join(self._dst_folder, curr_dst_folder, '{}_{}'.format(self._new_folder_names[idx_kf], curr_patient))

                        if (len(dst) > 150) and (system() == 'Windows'):
                            curr_patient = self.make_windows_path(dst, curr_patient)
                            dst = join(self._dst_folder, curr_dst_folder, '{}_{}'.format(self._new_folder_names[idx_kf], curr_patient))

                        print(F" -------------- \n {src} \n {dst}")
                        if os.path.exists(dst):
                            shutil.rmtree(dst)
                        shutil.copytree(src, dst)

        # Remove the previous LUT file and save the new one
        if prev_file_found:
            os.remove(prev_csv_file)
        _lut_file_name = join(self._dst_folder, '{}_from_{}_to_{}.csv'.format(self._lut_file_name, self._start_idx, last_idx - 1))
        self.saveLUT(LUT)
Code example #9
def main():
    config = get_makeprediction_config()
    # *********** Reads the parameters ***********

    input_file = config[ClassificationParams.input_file]
    output_folder = config[ClassificationParams.output_folder]
    output_imgs_folder = config[ClassificationParams.output_imgs_folder]
    output_file_name = config[ClassificationParams.output_file_name]
    model_weights_file = config[ClassificationParams.model_weights_file]
    forecasted_hours = config[LocalTrainingParams.forecasted_hours]
    pollutant = config[LocalTrainingParams.pollutant]

    # ********** Reading and preprocessing data *******
    _all_stations = [
        "ACO", "AJM", "AJU", "ARA", "ATI", "AZC", "BJU", "CAM", "CCA", "CES",
        "CFE", "CHO", "COR", "COY", "CUA", "CUI", "CUT", "DIC", "EAJ", "EDL",
        "FAC", "FAN", "GAM", "HAN", "HGM", "IBM", "IMP", "INN", "IZT", "LAA",
        "LAG", "LLA", "LOM", "LPR", "LVI", "MCM", "MER", "MGH", "MIN", "MON",
        "MPA", "NET", "NEZ", "PED", "PER", "PLA", "POT", "SAG", "SFE", "SHA",
        "SJA", "SNT", "SUR", "TAC", "TAH", "TAX", "TEC", "TLA", "TLI", "TPN",
        "UAX", "UIZ", "UNM", "VAL", "VIF", "XAL", "XCH"
    ]

    # Iterate over the stations
    models_folder = '/data/UNAM/Air_Pollution_Forecast/Data/Training/models'
    data_folder = '/data/UNAM/Air_Pollution_Forecast/Data/MergedDataCSV'
    for c_station in _all_stations:
        try:
            model_weights_file = [
                join(models_folder, x) for x in listdir(models_folder)
                if x.find(c_station) != -1
            ]
            input_file = [
                join(data_folder, x) for x in listdir(data_folder)
                if x.find(c_station) != -1
            ]
            # Selects the proper model file for the current station
            assert len(model_weights_file) > 0
            assert len(input_file) > 0

            print(F"Working with: {model_weights_file} and {input_file}")
            model_weights_file = model_weights_file[0]
            input_file = input_file[0]

            data = pd.read_csv(input_file, index_col=0)

            config[ModelParams.INPUT_SIZE] = len(data.columns)
            print(F'Data shape: {data.shape} Data axes {data.axes}')
            print("Done!")

            # Predicting for the next value after 24hrs (only one)
            print("Normalizing data....")
            datetimes_str = data.index.values
            datetimes = np.array([
                datetime.strptime(x, constants.datetime_format.value)
                for x in datetimes_str
            ])

            scaler = preprocessing.MinMaxScaler()
            scaler = scaler.fit(data)
            data_norm_np = scaler.transform(data)
            data_norm_df = DataFrame(data_norm_np,
                                     columns=data.columns,
                                     index=data.index)
            print(F'Done!')

            # Filtering only dates where there is data "forecasted hours after" (24 hrs after)
            print(F"\tBuilding X and Y ....")
            accepted_times_idx = []
            y_times_idx = []
            for i, c_datetime in enumerate(datetimes):
                forecasted_datetime = (c_datetime +
                                       timedelta(hours=forecasted_hours))
                if forecasted_datetime in datetimes:
                    accepted_times_idx.append(i)
                    y_times_idx.append(
                        np.argwhere(forecasted_datetime == datetimes)[0][0])

            X_df = data_norm_df.loc[datetimes_str[accepted_times_idx]]
            Y_df = data_norm_df.loc[datetimes_str[y_times_idx]][pollutant]
            X = X_df.values
            Y = Y_df.values

            print(F'X shape: {X.shape} Y shape: {Y.shape}')

            # *********** Chooses the proper model ***********
            print('Reading model ....')
            model = select_1d_model(config)

            # *********** Reads the weights***********
            print('Reading weights ....')
            model.load_weights(model_weights_file)

            create_folder(output_folder)
            create_folder(output_imgs_folder)

            # *********** Makes a dataframe to contain the DSC information **********
            metrics_params = config[ClassificationParams.metrics]
            metrics_dict = {met.name: met.value for met in metrics_params}

            # *********** Iterates over each case *********
            t0 = time.time()
            # -------------------- Reading data -------------
            output_nn_all = model.predict(X, verbose=1)

            # Plotting some intermediate results
            import matplotlib.pyplot as plt
            size = 24 * 60  # Two months of data
            start = np.random.randint(0, len(data) - size)
            end = start + size
            plt.figure(figsize=[64, 8])
            x_plot = range(len(Y))
            y_plot = Y
            yy_plot = Y_df.iloc[start:end].values
            viz_obj = EOAImageVisualizer(output_folder=output_imgs_folder,
                                         disp_images=False)
            plot_this_many = 24 * 60
            viz_obj.plot_1d_data_np(
                x_plot[0:plot_this_many],
                [y_plot[0:plot_this_many], output_nn_all[0:plot_this_many, 0]],
                title=F"{c_station} {pollutant}",
                labels=['Original', 'Forecasted'],
                wide_ratio=4,
                file_name_prefix=F"{pollutant}_{c_station}")

            print(F'\t Done! Elapsed time {time.time() - t0:0.2f} sec')

        except Exception as e:
            print(
                F"---------------------------- Failed {c_station} error: {e} ----------------"
            )
コード例 #10
0
def main():
    config = get_segmentation_2d_config()
    cases = config[ClassificationParams.cases]
    save_segmented_ctrs = config[ClassificationParams.save_segmented_ctrs]

    input_folder = config[ClassificationParams.input_folder]
    input_img_names = config[ClassificationParams.input_img_file_names]
    output_folder = config[ClassificationParams.output_folder]
    output_imgs_folder = config[ClassificationParams.output_imgs_folder]
    output_file_name = config[ClassificationParams.output_file_name]
    model_weights_file = config[ClassificationParams.model_weights_file]

    save_imgs = config[ClassificationParams.save_imgs]

    # Builds the visualization object
    viz_obj = MedicalImageVisualizer(
        disp_images=config[ClassificationParams.show_imgs],
        output_folder=output_imgs_folder)

    output_ctr_file_names = config[ClassificationParams.output_ctr_file_names]
    # *********** Chooses the proper model ***********
    print('Reading model ....')
    model = select_2d_model(config)

    # *********** Reads the weights***********
    print('Reading weights ....')
    model.load_weights(model_weights_file)

    examples = select_cases_from_folder(input_folder, cases)
    create_folder(output_imgs_folder)

    # *********** Makes a dataframe to contain the DSC information **********
    metrics_params = config[ClassificationParams.metrics]
    metrics_dict = {met.name: met.value for met in metrics_params}

    # Check if the output files already exist, in that case read the df from it.
    if os.path.exists(join(output_imgs_folder, output_file_name)):
        data = pd.read_csv(join(output_imgs_folder, output_file_name),
                           index_col=0)
    else:
        data_columns = list(metrics_dict.values())
        data = DataFrame(index=examples, columns=data_columns)

    # *********** Iterates over each case *********
    for id_folder, current_folder in enumerate(examples):
        print(F'******* Computing folder {current_folder} ************')
        t0 = time.time()
        try:
            # -------------------- Reading data -------------
            print('\t Reading data....')
            # All these names are predefined, for any other 2d segmentation we will need to create a different configuration
            all_imgs, all_ctrs, _, _ = read_preproc_imgs_and_ctrs_png(
                input_folder,
                folders_to_read=[current_folder],
                img_names=input_img_names,
                ctr_names=output_ctr_file_names)

            imgs_np = all_imgs[0]
            ctrs_lungs_np = all_ctrs[0][0].copy(
            )  # VERIFY THE ORDER IS THE SAME IN THE CONFIG FILE
            ctrs_lesion_np = all_ctrs[0][1].copy(
            )  # VERIFY THE ORDER IS THE SAME IN THE CONFIG FILE
            # If we want to visualize the input images
            # viz_obj.plot_imgs_and_ctrs_itk(img_np[0], ctrs_itk=ctrs_itk[0])

            # ------------------- Making prediction -----------
            print('\t Making prediction....')
            input_array = format_for_nn_classification(imgs_np)
            output_nn_all = model.predict(input_array, verbose=1)
            output_nn_np = output_nn_all[0, :, :, 0]
            output_nn_np[ctrs_lungs_np ==
                         0] = 0  # Making the prediction 0 outside the lungs
            # For visualizing the output of the network
            # viz_obj.plot_img_and_ctrs_np_2d(output_nn_np, np_ctrs=[], file_name_prefix=id_folder)

            # ------------------- Postprocessing -----------
            print('\t Postprocessing prediction....')
            threshold = .5
            print(F'\t\t Threshold NN output to {threshold} ....')
            output_nn_np[
                output_nn_np <=
                threshold] = 0  # Values at or below the threshold become background
            output_nn_np[
                output_nn_np >
                threshold] = 1  # Values above the threshold become foreground

            if save_segmented_ctrs:
                print('\t Saving Prediction...')
                create_folder(join(output_folder, current_folder))
                cv2.imwrite(
                    join(output_folder, current_folder,
                         output_ctr_file_names[0]),
                    cv2.convertScaleAbs(output_nn_np, alpha=(255.0)))

            # Compute metrics
            print('\t Computing metrics....')
            for c_metric in metrics_params:  # Here we can add more metrics
                if c_metric == ClassificationMetrics.DSC_2D:
                    metric_value = numpy_dice(output_nn_np, ctrs_lesion_np)
                    data.loc[current_folder][c_metric.value] = metric_value
                    print(F'\t\t ----- DSC: {metric_value:.3f} -----')

            # Saving the results every 10 steps
            if id_folder % 10 == 0:
                save_metrics_images(data,
                                    metric_names=list(metrics_dict.values()),
                                    viz_obj=viz_obj)
                data.to_csv(join(output_folder, output_file_name))

            if save_imgs:
                print('\t Plotting images....')
                plot_intermediate_results(current_folder,
                                          data_columns,
                                          img_np=imgs_np[0],
                                          gt_ctr_np=ctrs_lesion_np,
                                          nn_ctr_np=output_nn_np,
                                          data=data,
                                          viz_obj=viz_obj)

        except Exception as e:
            print(
                "---------------------------- Failed {} error: {} ----------------"
                .format(current_folder, e))
        print(F'\t Done! Elapsed time {time.time()-t0:0.2f} sec')

    save_metrics_images(data,
                        metric_names=list(metrics_dict.values()),
                        viz_obj=viz_obj)
    data.to_csv(join(output_folder, output_file_name))
Code example #11
def normalizeAndFilterData(data,
                           datetimes_orig,
                           forecasted_hours,
                           output_folder='',
                           run_name='',
                           read_from_file=False):
    """
    This function normalizes the data and filters only the cases where we
    have the appropriate forecasted times. It also obtains the 'y' indices.
    :param data: All the data
    :param datetimes_orig: An array of datetimes which corresponds to the data index
    :param forecasted_hours: An integer representing the number of hours in advance we want to forecast
    :param output_folder: Folder where the normalization values are saved to or read from
    :param run_name: Name used to prefix the saved normalization files
    :param read_from_file: If True, read previously saved normalization values instead of computing them
    :return:
    """
    # Predicting for the next value after 24hrs (only one)
    print("Normalizing data....")
    datetimes = np.array(datetimes_orig)

    all_data_cols = data.columns.values
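    # Column-name convention: names containing 'week', 'hour' or 'year' are treated as
    # date features, names without an 'h' as per-station pollutant readings, and the
    # remaining columns as meteorological variables.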
    date_columns = [
        x for x in all_data_cols if (x.find('week') != -1) or (
            x.find('hour') != -1) or (x.find('year') != -1)
    ]
    stations_columns = [
        x for x in all_data_cols
        if (x.find('h') == -1) and (x not in date_columns)
    ]
    meteo_columns = [
        x for x in all_data_cols if (x.find('h') != -1) and (
            x not in date_columns) and (x not in stations_columns)
    ]

    # Normalizing meteorological variables
    # In this case we obtain the normalization values directly from the data
    # meteo_names = ['U10', 'V10', 'RAINC', 'T2', 'RAINNC', 'PBLH', 'SWDOWN', 'GLW']
    meteo_names = ['U10', 'V10', 'RAINC', 'T2', 'RAINNC', 'SWDOWN', 'GLW']
    if not (read_from_file):
        min_data = {}
        max_data = {}
        for cur_meteo in meteo_names:
            cur_meteo_cols = [
                x for x in meteo_columns if x.find(cur_meteo) != -1
            ]
            min_data[cur_meteo] = data[cur_meteo_cols].min().min()
            max_data[cur_meteo] = data[cur_meteo_cols].max().max()
        # ********* Saving normalization values for each variable ******
        create_folder(output_folder)
        pd.DataFrame(min_data, index=[1]).to_csv(
            join(output_folder, F'{run_name}_min_values.csv'))
        pd.DataFrame(max_data, index=[1]).to_csv(
            join(output_folder, F'{run_name}_max_values.csv'))
    else:  # In this case we obtain the normalization values from the provided file
        min_data = pd.read_csv(join(output_folder,
                                    F'{run_name}_min_values.csv'),
                               index_col=0)
        max_data = pd.read_csv(join(output_folder,
                                    F'{run_name}_max_values.csv'),
                               index_col=0)

    data_norm_df = data.copy()

    # Normalizing the meteorological variables
    for cur_meteo in meteo_names:
        cur_meteo_cols = [x for x in meteo_columns if x.find(cur_meteo) != -1]
        # The data structure is a little bit different when reading from the file
        if not (read_from_file):
            min_val = min_data[cur_meteo]
            max_val = max_data[cur_meteo]
        else:
            min_val = min_data[cur_meteo].values[0]
            max_val = max_data[cur_meteo].values[0]
        data_norm_df[cur_meteo_cols] = (data_norm_df[cur_meteo_cols] -
                                        min_val) / (max_val - min_val)

    # Normalizing the pollution variables
    data_norm_df[stations_columns] = (data_norm_df[stations_columns] -
                                      _min_value_ozone) / (_max_value_ozone -
                                                           _min_value_ozone)
    print(F'Done!')

    # Filtering only dates where there is data "forecasted hours after" (24 hrs after)
    print(F"Building X and Y ....")
    accepted_times_idx = []
    y_times_idx = []
    for i, c_datetime in enumerate(datetimes):
        forecasted_datetime = c_datetime + np.timedelta64(
            forecasted_hours, 'h')
        if forecasted_datetime in datetimes:
            accepted_times_idx.append(i)
            y_times_idx.append(
                np.argwhere(forecasted_datetime == datetimes)[0][0])

    # ****************** Replacing nan columns with the mean value of all the other columns ****************
    mean_values = data_norm_df[stations_columns].mean(1)

    # Replace nan values with -1 and add additional MEAN column
    print(F"Filling nan values....")
    data_norm_df_final = data_norm_df.copy()
    for cur_station in stations_columns:
        data_norm_df_final[cur_station] = data_norm_df[cur_station].fillna(-1)

    data_norm_df_final['MEAN'] = mean_values

    # print(F"Norm params: {scaler.get_params()}")
    # file_name_normparams = join(parameters_folder, F'{model_name}.txt')
    # utilsNN.save_norm_params(file_name_normparams, NormParams.min_max, scaler)
    print("Done!")

    return data_norm_df_final, accepted_times_idx, y_times_idx, stations_columns, meteo_columns
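
A minimal usage sketch (assuming the DataFrame columns follow the naming convention described above; the file name, run name and output folder are hypothetical):

data = pd.read_csv('merged_data.csv', index_col=0, parse_dates=True)   # hypothetical file
datetimes = data.index.values
norm_df, x_idx, y_idx, station_cols, meteo_cols = normalizeAndFilterData(
    data, datetimes, forecasted_hours=24,
    output_folder='Training/Parameters',   # hypothetical folder
    run_name='example_run', read_from_file=False)
X = norm_df.loc[datetimes[x_idx]].drop(columns=station_cols).values
Y = norm_df.loc[datetimes[y_idx]][station_cols].values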