def process_files(from_dir, this_file):

    stn = this_file.split('/')[-1]
    print("Prossecing station: ", stn)
    path = create_path(path_join(output_dir, from_dir, stn))

    for init_time, domain in combined_list:

        var_metrics = []
        for var in variables.values():

            file_names = '_'.join([var, domain, '*']) + ".txt"
            stages = glob(path_join(this_file, init_time, file_names))

            # sorting stages
            stages_idx = [s.split('_')[-1].split('.')[0] for s in stages]
            stages_idx = np.argsort(np.array(stages_idx, dtype='int32'))
            stages = [stages[idx] for idx in stages_idx]

            # computing all metrics for each variable at the same stage
            s_metrics = [
                compute_metrics(stg, metrics.values()) for stg in stages
            ]
            var_metrics.append(s_metrics)

        var_metrics = np.transpose(var_metrics, axes=(2, 1, 0))
        for name, arr in zip(metrics.keys(), var_metrics):

            sf = path_join(path, '_'.join([name, init_time, domain]) + '.txt')
            dframe = pd.DataFrame(data=arr, columns=list(variables.values()))
            dframe.to_csv(sf, index=True, header=True, sep=' ', na_rep='nan')
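The glob above returns the stage files in lexicographic order, so process_files re-sorts them by their integer stage suffix. A minimal standalone sketch of that numeric sort (the file names below are made up for illustration):

import numpy as np

# Hypothetical glob output: lexicographic order puts stage 10 before stage 2.
stages = ['t2m_d01_0.txt', 't2m_d01_10.txt', 't2m_d01_2.txt']

# Extract the integer stage suffix and argsort on it, as done in process_files.
stages_idx = [s.split('_')[-1].split('.')[0] for s in stages]
order = np.argsort(np.array(stages_idx, dtype='int32'))
print([stages[idx] for idx in order])
# ['t2m_d01_0.txt', 't2m_d01_2.txt', 't2m_d01_10.txt']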
Example #2
def traverse_and_process():
    # For data augmentation
    # Used to apply augmentor with 50% probability
    sometimes = lambda aug: va.Sometimes(0.5, aug)
    seq = va.Sequential([
        # Randomly crop video with a size of (96 x 96)
        va.RandomCrop(size=(96, 96)),
        # Randomly rotate the video by an angle chosen from [-10, 10] degrees
        va.RandomRotate(degrees=10),
        # horizontally flip the video with 50% probability
        sometimes(va.HorizontalFlip())
    ])

    foldernames = []

    for root, dirs, files in os.walk(config.VIDEO_DIREC):
        if len(dirs) > 0:
            # keep only the top-level class folders
            foldernames = sorted(dirs)
            break

    for folder in tqdm(foldernames, desc='Folder', bar_format='{l_bar}{bar:40}{r_bar}{bar:-10b}'):

        filenames = []
        for root, dirs, files in os.walk(f'{config.VIDEO_DIREC}/{folder}'):
            filenames = sorted(file.split(".")[0] for file in files
                               if file != ".DS_Store")

            for filename in tqdm(filenames[:20], desc='Class ', bar_format='{l_bar}{bar:40}{r_bar}{bar:-10b}'):
                # for filename in filenames:
                for aug_idx in tqdm(range(10), desc='Files ', bar_format='{l_bar}{bar:40}{r_bar}{bar:-10b}'):
                    # Make sure the dst folder exists
                    utils.create_path(f'{config.CROPPED_DIREC}/{(int(folder) - 1)}{aug_idx}')
                    # Set the paths
                    src_path = f'{config.VIDEO_DIREC}/{folder}/{filename}.mp4'
                    dst_path = f'{config.CROPPED_DIREC}/{(int(folder) - 1)}{aug_idx}/{filename}.npz'

                    # utils.check_video_length(src_path, verbose=True)
                    sequence = detect_face_from_file(src_path, verbose=False)
                    assert sequence is not None, f'Cannot crop from {src_path}.'
                    # print(type(sequence), sequence.shape)
                    # Augment every copy except the first so the original clip is kept
                    if config.AUGMENT and aug_idx != 0:
                        sequence = np.array(seq(sequence))
                    # print(sequence.shape)
                    # Convert to grayscale or reverse the channel axis (BGR -> RGB)
                    data = transform.convert_bgr2gray(
                        sequence) if config.CONVERT_GRAY else sequence[..., ::-1]

                    utils.save2npz(dst_path, data=data)
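A minimal sketch of how the vidaug pipeline defined at the top of traverse_and_process can be exercised on its own; the synthetic 16-frame clip below is an assumption standing in for the output of detect_face_from_file:

import numpy as np
import vidaug.augmentors as va

sometimes = lambda aug: va.Sometimes(0.5, aug)
seq = va.Sequential([
    va.RandomCrop(size=(96, 96)),
    va.RandomRotate(degrees=10),
    sometimes(va.HorizontalFlip())
])

# Fake clip: 16 frames of 120x120 RGB noise (larger than the 96x96 crop).
clip = [np.random.randint(0, 255, (120, 120, 3), dtype=np.uint8) for _ in range(16)]

augmented = np.array(seq(clip))
print(augmented.shape)  # expected: (16, 96, 96, 3)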
Example #3
def cluster_embed(dataset_name, app_name, embedding_name, embedding_df, labels):
    clusterer = AgglomerativeClustering(n_clusters=len(labels.unique()))

    clusterer.fit(embedding_df.values)

    clustered_labels = clusterer.labels_

    cluster_labels_df = pd.DataFrame({"cluster_label": clustered_labels, "label": labels}, index=embedding_df.index)

    path = create_path([".", "data", "clusters", dataset_name, app_name], file_name=f"{embedding_name}.csv")

    cluster_labels_df.to_csv(path)
Example #4
def sort_points_by_ref(dataset_name, app_name, embedding_name, embeddings,
                       labels):
    # First, find the euclidean distances between every embedding
    mutual_distances = get_pairwise_dist(embeddings, metric="euclidean")

    # Get the text associated with each feedback
    text_df = get_feedback_text(dataset_name, app_name)

    # Find a place to save this visualisation
    finder_dir = create_path([".", "results", "", dataset_name, app_name],
                             file_name=embedding_name)

    # Create the visualisation
    create_nn_finder_html(mutual_distances, text_df, finder_dir)
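get_pairwise_dist is defined elsewhere; presumably it returns a dense n-by-n distance matrix over the embedding rows. A minimal stand-in using scikit-learn (the implementation is an assumption, only the call signature comes from the snippet above):

import numpy as np
from sklearn.metrics import pairwise_distances

def get_pairwise_dist(embeddings, metric="euclidean"):
    # embeddings: (n_samples, n_features) array-like of embedding vectors
    return pairwise_distances(np.asarray(embeddings), metric=metric)

print(get_pairwise_dist(np.random.rand(5, 16)).shape)  # (5, 5)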
def plot_files(combined_list):

    # get values from iterable:
    metric, stn_id, domain = combined_list

    print("Creating '{}' plots for station: {}, domain: {}".format(
        metric, stn_id, domain))

    # get files from model and mos at the same time:
    sorted_files = []
    for from_dir in from_dirs:
        path = path_join(output_dir, from_dir, stn_id,
                         '_'.join([metric, '*', domain]) + '*')
        sorted_files.extend(sorted(glob(path)))

    plot_stages = []
    plot_values = []
    for this_file in sorted_files:

        dframe = pd.read_csv(this_file, delimiter=' ', index_col=0)

        # create a nice string representation of forecast hours
        # (adding stages to initialization hour)
        init = this_file.split('.')[0].split('_')[-2][:2]
        stages = [
            str(timedelta(hours=int(init) + 3.0 * off_set)).split(', ')[-1]
            for off_set in range(dframe.shape[0])
        ]
        plot_stages.append(stages)
        plot_values.append(dframe)

    for name, var in variables.items():

        # list containing values for all initializations of the same
        # variable (var) for the same domain...
        data = [values[var].values for values in plot_values]

        # finally doing some actual plotting
        fig = plot_lines(plot_stages, data, name, stn_id, domain, metric)

        fig_path = create_path(path_join(output_dir, 'plots', stn_id))
        fig_name = path_join(fig_path, '_'.join([metric, var, domain]))

        fig.savefig(fig_name + '.png', dpi=200)
        plt.close(fig)
Example #6
def create_results_dirs(result_type, dataset_name, app_name):
    app_dir = create_path(
        [".", "results", result_type, dataset_name, app_name])
    return app_dir
Example #7
    def generate_config(self):
        with open(os.path.join(str(Path.home()), ".kaggle/path.json"), "r") as f:
            env_file = json.load(f)
        with open(os.path.join(os.getcwd(), "competition.json"), "r") as f:
            competition_file = json.load(f)
        with open(os.path.join(os.getcwd(), "configs/service/aug.json"), "r") as f:
            aug_json = json.load(f)
        self.config.update(competition_file)
        self.config['augs'] = aug_json
        self.config['dataset'].update(competition_file['dataset_split'])
        self.config['siamese'] = self.config.get("siamese", False)

        if self.config['siamese'] & self.config['dataset']['resize'][0] >= 512:
            self.config["competition_data_folder"] = "protein"
            self.config["competition_img"]["type"] = {
                "test": "tif",
                "train": "tif"
            }
        elif self.config['competition_data_folder'] == "protein/protein_1024":
            self.config["competition_img"]["type"] = {
                "test": "jpg",
                "train": "jpg"
            }
        elif self.config['competition_data_folder'] == "protein/protein_512":
            self.config["competition_img"]["type"] = {
                "test": "png",
                "train": "png"
            }
        else:
            raise NotImplementedError("No such data folder")

        self.config['n_threds'] = cpu_count()
        self.config['device'] = "cuda" if torch.cuda.is_available() else "cpu"
        self.config["net_name"] = self.config['net_class'].split(".")[-1]
        self.config['out_path'] = os.path.join(env_file['output_path'],
                                               self.config['competition_name'])
        self.config['out_folder'] = os.path.join(self.config['out_path'],
                                                 self.get_folder_type())
        self.config['out_folder'], counter = utils.create_path(
            self.config['out_folder'])
        self.config[
            'visdom_env_name'] = self.config['net_name'] + f"_{counter}"
        self.config['data_folder'] = os.path.join(
            env_file['data_path'], self.config['competition_data_folder'])
        self.config['predictions_folder'] = os.path.join(
            self.config['out_folder'], "predicitons")
        self.config['weights_folder'] = os.path.join(self.config['out_folder'],
                                                     "weights")
        self.config['splits_path'] = os.path.join(
            str(Path.home()),
            ".kaggle_splits",
            self.config['competition_name'],
            "debug" if self.is_debug() else "",
        )
        if self.config['pretrained_weights'] is not None:
            self.config['pretrained_weights'] = os.path.join(
                os.path.split(self.config['out_folder'])[0],
                self.config['pretrained_weights'])
        if self.config['use_folds'] == "all":
            self.config['use_folds'] = list(
                range(self.config['dataset']['n_folds']))
        self.config['debug'] = False
        if self.is_debug():
            self.config['n_epochs'] = 2
            self.config['mode_train']['unfreeze'] = 1
            self.config['debug'] = True
            # self.config['mode_stack']['early_stopping_rounds'] = 5
            # self.config['mode_stack']["num_rounds"] = 10

        assert len(self.config['mode_train']
                   ['loss']) <= 1, "Cannot be more than 1 loss type"
        assert self.config['competition_type'] in [
            'binary', 'multilabel', 'multiclass', 'segmentation'
        ]
Example #8
# define pipeline pre-processor:
preprocessor = Pipeline([("features", combined_features),
                         ("scaler", x_scaler)])

if __name__ == "__main__":

    # Create the dataset
    path = 'data/train_data/'
    ref_var = ['rain', 'mslp', 't2m', 'rh2m', 'wind']

    var_key = 'rain'
    init_time = '00'
    domain = 'd01'

    # path for saving models
    save_path = create_path(os.path.join('models', var_key))

    ref_var.remove(var_key)
    file_name = '_'.join([var_key] + ref_var + [domain, init_time]) + '.mat'
    # file_name = 't2m_d01_00.mat'

    print('Loading data from: {}'.format(file_name))
    data = loadmat(path + file_name, squeeze_me=True)

    print('Loading station clusters...')
    clusters = loadmat('data/clustered_stations_' + var_key + '.mat')

    # get reference scores:
    # (only save a trained model if its score is higher than
    #  the previously computed best score for this cluster)
    scores_path = 'data/best_scores_' + var_key + '.mat'
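The comment above says a trained model should only be persisted when it beats the best score stored for its cluster; a minimal sketch of that gate (the key naming, joblib usage and function name are assumptions, not the script's actual training loop):

import os
import numpy as np
import joblib
from scipy.io import loadmat

def save_if_better(model, score, cluster_id, scores_path, save_path):
    # Hypothetical compare-and-save step following the comment above.
    best_scores = loadmat(scores_path, squeeze_me=True)
    key = 'cluster_{}'.format(cluster_id)
    previous_best = float(best_scores.get(key, -np.inf))
    if score > previous_best:
        joblib.dump(model, os.path.join(save_path, key + '.pkl'))
    return score > previous_best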
Example #9
    combined_list = combine([['00', '12'], ['d01', 'd02'],
                             ['rain', 'mslp', 't2m', 'rh2m', 'wind']])

    # create pool of workers:
    if np.isscalar(n_process):
        n_process = max(1, int(n_process))
    else:
        n_process = cpu_count()

    print('Running with {:n} workers.'.format(n_process))
    pool = Pool(n_process)

    # running in parallel
    function = partial(process_item, path_input, path_output, unique)

    pool.map(function, combined_list)
    pool.close()
    pool.join()


if __name__ == '__main__':

    # create training dataset
    path_input = 'data/stn_vs_raw'
    path_output = create_path('data/train_data')

    # assingle_items(path_input, path_output, unique=True)

    var_key = 'rain'
    combined_items(path_input, path_output, var_key)
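The combine helper used above (and again inside apply_regression below) is not shown; presumably it just yields the Cartesian product of the given lists, i.e. a thin wrapper around itertools.product:

from itertools import product

def combine(list_of_lists):
    # Hypothetical stand-in: Cartesian product of the given option lists.
    return list(product(*list_of_lists))

# combine([['00', '12'], ['d01', 'd02']])
# -> [('00', 'd01'), ('00', 'd02'), ('12', 'd01'), ('12', 'd02')]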
def apply_regression(path_input,
                     path_output,
                     var,
                     init_times=[],
                     domains=[],
                     regressors=None,
                     verbose=False):

    ind = var_keys.index(var)
    #
    for init_time, domain in combine([init_times, domains]):

        # file_name patterns to search
        wildcards = [
            '*'.join([var_key, domain] + ['']) for var_key in var_keys
        ]

        print('find files with patterns: ', wildcards,
              'from initialization: ', init_time, '\n')

        stn_files = os.listdir(path_input)

        init_time += '00'
        stacked_correlations = {}
        for stn_id in stn_files:
            stn = int(stn_id.split('_')[-1])
            if verbose:
                print('Creating bias-corrected data for station: ', stn)

            path = create_path(
                os.path.join(path_output, stn_id, init_time))

            var_files = [
                sorted(
                    glob(
                        os.path.join(path_input, stn_id, init_time,
                                     wildcard))) for wildcard in wildcards
            ]

            # create dictionary for computed correlations (one for each stage)
            stage_correlations = {s: None for s in range(25)}
            for same_stages in zip(*var_files):

                stage = same_stages[0].split('.')[0].split('_')[-1]

                dates = []
                observations = []
                predictions = []
                for this_file in same_stages:

                    data = pd.read_table(
                        this_file,
                        names=names,
                        sep=' ',
                        converters=date_converter,
                        engine='c')

                    # mask out nan model values
                    valid = ~np.isnan(data['forecast'].values)

                    # append member to existing ones
                    dates.append(data['dates'].values[valid])
                    observations.append(data['observed'].values[valid])
                    predictions.append(data['forecast'].values[valid])
                dates = np.array(dates)
                if dates.shape[0] == len(var_files) and dates.ndim != 1:

                    # only if all predictors share the same dates
                    if not np.diff(dates, axis=0).any():

                        # creating predictors (some physics here): mslp, temp, relh
                        if var == 'rain':
                            x = np.stack(predictions, axis=0)
                            x[-1], x[-2], x[-3] = apply_physics(
                                x[-1], x[-2], x[-3])
                            # normalization factor for target variable
                            scale = 1.0  # mm / 3h
                        else:
                            # (temp, press, wind?) no need for extra
                            # predictions (for now) we will only use
                            # model forecasts: only simple normalization
                            x = np.expand_dims(predictions[ind], axis=0)
                            # normalization factor for target variable
                            scale = 36.0  # degrees Celsius
                            x /= scale

                        x_pred = x.T
                        if hasattr(regressors, 'predict'):

                            corrected = regressors.predict(x_pred) * scale

                        elif stn_id in regressors:

                            corrected = regressors[stn_id].predict(
                                x_pred) * scale

                        else:
                            # just use forecast values
                            print(
                                'MOS is not available for this station at stage: ',
                                str2stamp(
                                    str(dates[0]),
                                    off_set=3 * int(stage),
                                    to_julian=False,
                                    as_int=False),
                                ' using forecast instead')
                            corrected = predictions[ind]

                        stacked_array = np.stack(
                            [dates[0], observations[ind], corrected],
                            axis=-1)

                        # write stn predictions to disk
                        file_name = '_'.join([var, domain, stage])
                        file_name = os.path.join(path, file_name)
                        np.savetxt(file_name + '.txt', stacked_array)

                        stage_correlations[int(
                            stage)] = pearson_correlation(
                                stacked_array[:, 1], stacked_array[:, 2])

                else:
                    # if the station is not suitable for the bias-correction
                    # algorithm, use the model forecast only:
                    stage_correlations[int(stage)] = pearson_correlation(
                        observations[ind], predictions[ind])

            # store Pearson correlations for the gridded MOS
            stacked_correlations[stn] = list(stage_correlations.values())

        # write csv file for the plots:
        stacked_correlations = pd.DataFrame(data=stacked_correlations)

        file_name = '_'.join(['pearson', init_time, domain]) + '.csv'
        path = create_path(
            os.path.join(
                'test_data/processed/CorrelacionesPuntuales/PearsonT/MOS/pearson',
                var))

        print('Save file: ', file_name, 'with data shaped: ',
              stacked_correlations.values.shape, '\n')

        stacked_correlations.to_csv(
            os.path.join(path, file_name),
            sep=',',
            na_rep='nan',
            header=list(stacked_correlations.columns),
            index=False,
            mode='w')

        print('Post-processing finished. Congrats!')
if __name__ == '__main__':
    '''
        This script generates the data for each station by applying the
        regression equation found for that station, and then saves the files
        in the same layout as the inputs: /stn_***/init/var_d0*_*.txt

        The bias-corrected series do not cover the same dates, because
        simultaneous occurrence of all predictors is not guaranteed and
        because of the NaN-filtering step
        (this should be seriously revised for better regression models)
    '''

    # create training dataset
    path_input = 'data/stn_vs_raw/'
    path_output = create_path('data/stn_vs_mos/')

    var_key = 'rain'
    init_times = ['00']  #, '12']
    domains = ['d01']  #, 'd02']:

    regressors = {'stn_308': Dummy_Regressor, 'stn_320': Dummy_Regressor}

    apply_regression(
        path_input,
        path_output,
        var_key,
        init_times=init_times,
        domains=domains,
        regressors=regressors)
                              cmap=cm.plasma_r,
                              norm=norm,
                              orientation='vertical',
                              label='Color mapping for values of ' +
                              xlabels[-1])

    return ax.get_figure()


if __name__ == '__main__':

    # Create the dataset
    ref_var = ['rain', 'mslp', 't2m', 'rh2m']

    path = 'data/train_data/'
    path_output = create_path('outputs/test_scalers/')

    var_key = 't2m'
    init_time = '00'
    domain = 'd01'

    ref_var.remove(var_key)
    file_name = '_'.join([var_key] + ref_var + [domain, init_time]) + '.mat'

    print('Loading data...')
    data = loadmat(path + file_name, squeeze_me=True)
    #
    for key in ['__header__', '__version__', '__globals__']:
        _ = data.pop(key)

    # join all data