def plot_avg_similarities(dataset_name, save_file=False):
    paths = [path for path in os.listdir('results/similarity') if path.startswith(dataset_name)]
    # start with the mean ground-truth similarity, then append one mean per embedding
    values = [np.mean(read_dataset(dataset_name)['gt_sim'].to_numpy())]
    embeddings = ['GT']
    for path in paths:
        values.append(np.nanmean(pd.read_csv(f'results/similarity/{path}')['cosine_sim'].to_numpy()))
        emb_name = path.split('_')
        if emb_name[1] == 'numberbatch':
            embeddings.append(f'{emb_name[2][0].upper()}-{emb_name[1][0].upper()}-{emb_name[3]}')
        else:
            embeddings.append(f'{emb_name[1][0].upper()}-{emb_name[2][0].upper()}-{emb_name[3]}')
    data = pd.DataFrame()
    data['embeddings'] = embeddings
    data['similarities'] = values
    sns.set(style='darkgrid', context='poster', font='Verdana')
    f, ax = plt.subplots()
    sns.barplot(x='embeddings', y='similarities', ax=ax, data=data)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=75)
    ax.axhline(0, color='k', clip_on=False)
    plt.ylim(0, 10)
    for bar, value in zip(ax.patches, data['similarities'].to_numpy()):
        text_x = bar.get_x() + bar.get_width() / 2.0
        text_y = bar.get_height() + 0.025
        text = f'{round(value, 5)}'
        ax.text(text_x, text_y, text, fontsize=20, ha='center', va='bottom', rotation=90, color='k')
    sns.despine(bottom=True)
    plt.title(dataset_name)
    if save_file:
        figure = plt.gcf()
        figure.set_size_inches(10, 8)
        plt.subplots_adjust(left=0.1, right=0.95, top=0.95, bottom=0.3)
        plt.savefig(f'results/img/{dataset_name}_avg_sim.png')
    else:
        plt.show()
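A minimal usage sketch (the dataset name below is made up; it only needs to match the prefix of the CSV files in results/similarity):

plot_avg_similarities('wordsim353')                   # show the bar chart interactively
plot_avg_similarities('wordsim353', save_file=True)   # write results/img/wordsim353_avg_sim.png
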
Example #2
def calculate_cosine_similarity(dataset_name, emb_name, emb_type, emb_size):
    cosine = list()
    dataset = read_dataset(dataset_name)
    embeddings = read_embeddings(dataset_name, emb_name, emb_type, emb_size)
    for _, row in dataset.iterrows():
        word1, word2 = row['word1'].lower(), row['word2'].lower()
        if word1 in embeddings and word2 in embeddings:
            vec1 = embeddings[word1]
            vec2 = embeddings[word2]
            # multiply by 10 so the value is comparable to the ground-truth similarity scores
            cosine.append(round(cosine_similarity([vec1], [vec2])[0][0] * 10, 2))
        else:
            cosine.append(None)
    dataset['cosine_sim'] = cosine
    dataset.to_csv(
        f'results/similarity/{dataset_name}_{emb_name}_{emb_type}_{emb_size}_cosine.csv',
        index=False)

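For reference, the value stored in cosine_sim is the cosine of the angle between the two word vectors, rescaled by a factor of 10 so it is comparable to the ground-truth scores plotted above; a minimal numpy sketch of the same computation (the vectors are made up):

import numpy as np

vec1 = np.array([0.2, 0.1, 0.7])
vec2 = np.array([0.3, 0.0, 0.5])
cos = np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
print(round(cos * 10, 2))  # same value as round(cosine_similarity([vec1], [vec2])[0][0] * 10, 2)
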
def plot_similarity(dataset_name, embeddings):
    dataset = read_dataset(dataset_name)
    data = pd.DataFrame()
    pairs = list(range(len(dataset['gt_sim'])))
    embedding_names = ['gt_similarity'] * len(dataset['gt_sim'])
    similarities = dataset['gt_sim'].tolist()
    for embedding in embeddings:
        dataset = pd.read_csv(f'results/similarity/{dataset_name}_{embedding}_cosine.csv')
        pairs += list(range(len(dataset['cosine_sim'])))
        embedding_names += [embedding] * len(dataset['cosine_sim'])
        similarities += dataset['cosine_sim'].tolist()
    data['pairs'] = pairs
    data['embeddings'] = embedding_names
    data['similarities'] = similarities
    sns.set(style='darkgrid', context='poster', font='Verdana', font_scale=0.5)
    sns.lineplot(x='pairs', y='similarities', hue='embeddings', style='embeddings', dashes=False, data=data)
    plt.show()
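A minimal usage sketch; the embedding identifiers below are made up and have to match the {dataset_name}_{embedding}_cosine.csv files written by calculate_cosine_similarity above:

plot_similarity('wordsim353', ['glove_wikipedia_300', 'word2vec_google_300'])
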
Example #4
    def process_dataset(self):

        if self.dataset:
            logging.debug("Processing dataset.")
            X, Y = preprocess.read_dataset(self.dataset, balanced=True)
            print(X.shape)
            Y = to_categorical(Y)
            logging.debug("X example: %s\ny example: %s" % (X[0], Y[0]))
            X_train, X_val, X_test, y_train, y_val, y_test = preprocess.split_dataset(
                X, Y)
            self.num_steps = X.shape[1]
        elif self.train_path:
            X_train, y_train = preprocess.read_set(self.train_path)
            X_train, X_val, y_train, y_val = preprocess.split_dataset(
                X_train, y_train, test_size=0.2, validation=False)
            X_test, y_test = preprocess.read_set(self.test_path)
            self.num_steps = X_train.shape[1]

        return X_train, X_val, X_test, y_train, y_val, y_test
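For reference, to_categorical one-hot encodes the integer labels before the split; a minimal numpy sketch of the equivalent operation (the label array is made up):

import numpy as np

Y = np.array([0, 2, 1, 2])
one_hot = np.eye(Y.max() + 1)[Y]  # same rows as keras.utils.to_categorical(Y)
print(one_hot.shape)              # (4, 3)
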
Example #5
def anta_normalize(x, y):
    # preprocessing scRNA-seq read counts matrix
    y = y.astype(np.int32)
    adata = sc.AnnData(x)
    adata.obs['Group'] = y

    adata = read_dataset(adata, transpose=False, test_split=False, copy=True)

    adata = process_normalize(adata,
                              size_factors=True,
                              normalize_input=True,
                              logtrans_input=True)

    print(adata.X.shape)

    x_sd = adata.X.std(0)
    x_sd_median = np.median(x_sd)
    print("median of gene sd: %.5f" % x_sd_median)

    x = adata.X.astype(np.float32)
    y = y.astype(np.int32)
    raw_data = adata.raw.X
    return x, y, adata.obs.size_factors, raw_data
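A minimal usage sketch, assuming x is a cells-by-genes count matrix and y holds integer group labels (both arrays below are synthetic):

import numpy as np

x = np.random.poisson(lam=2.0, size=(100, 500)).astype(np.float64)  # synthetic read counts
y = np.random.randint(0, 5, size=100)                               # synthetic group labels
x_norm, y, size_factors, raw_counts = anta_normalize(x, y)
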
    args = parser.parse_args()

    # load dataset
    optimizer1 = Adam(amsgrad=True)
    optimizer2 = 'adadelta'

    data_mat = h5py.File(args.data_file, 'r')
    x = np.array(data_mat['X'])
    y = np.array(data_mat['Y'])

    adata = sc.AnnData(x)
    adata.obs['Group'] = y

    adata = read_dataset(adata,
                         transpose=False,
                         test_split=False,
                         copy=True)

    adata = normalize(adata,
                      size_factors=True,
                      normalize_input=True,
                      logtrans_input=True)

    input_size = adata.n_vars

    print(adata.X.shape)
    print(y.shape)

    x_sd = adata.X.std(0)
    x_sd_median = np.median(x_sd)
    print("median of gene sd: %.5f" % x_sd_median)
Example #7
    def process_dataset(self):
        X, Y = preprocess.read_dataset(self.dataset, balanced=True)
        return preprocess.split_dataset(X, Y, validation=False)
Example #8
    p_train = 0.8
    np.random.seed(42)
    filename = 'data_info.csv'

    add_dir(save_dir_public_train)
    add_dir(save_dir_public_test)
    add_dir(save_dir_private_train)
    add_dir(save_dir_private_test)

    # dataset_1 is already public (the old atlas dataset); all of it must stay public
    atlas_dataset = 'dataset_1'
    # the newer, private datasets may be split between private and public
    new_datasets = ['dataset_4', 'dataset_5']

    # read datasets
    data_info_atlas = read_dataset(atlas_dataset)
    assert data_info_atlas is not None
    file_list_atlas = list_files(data_info_atlas)

    file_list_new = []
    for dataset in new_datasets:
        data_info_new = read_dataset(dataset)
        assert data_info_new is not None
        file_list_new.extend(list_files(data_info_new))

    n_atlas = len(file_list_atlas)
    n_new = len(file_list_new)
    n_total = n_atlas + n_new

    # calculate the number of samples required in each dataset
    n_public = max(int(0.6 * n_total), n_atlas)
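For instance (made-up counts), with 300 atlas files and 200 new files n_total is 500, so n_public = max(int(0.6 * 500), 300) = 300; taking the max with n_atlas guarantees the public split never drops below the files that are already public.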
Example #9
        save_dir = os.path.join(save_dir, 'public')
    else:
        dataset_name = 'dataset_2'
        filename = 'private.csv'
        save_dir = os.path.join(save_dir, 'private')
    if save_preprocessed_data:
        if not os.path.exists(save_dir):
            os.mkdir(save_dir)

    path_analysis = 'data/data_analysis'
    path_raw = os.path.join(path_analysis, filename)

    if not os.path.exists(path_analysis):
        os.mkdir(path_analysis)

    data_info = read_dataset(dataset_name)
    assert data_info is not None
    file_list = list_files(data_info)

    column_names = [
        'RawPath', 'T1_filename', 'n_lesions', 'RawSize_x', 'RawSize_y',
        'RawSize_z', 'RawLesionSize', 'AverageGrey'
    ]
    if save_preprocessed_data:
        column_names.extend(
            ['NewPath', 'NewT1_name', 'NewMask_name', 'NewAverageGrey'])

    data = []
    idx = 1
    if not file_list:
        print(f'No data files found in {data_info["raw_dir"]}')