Example #1
def rxrx_control_cell_type(cell_type):
    print(f"Loading metadata for control {cell_type}...")
    # load metadata
    df = rio.combine_metadata().reset_index()
    # keep only positive-control wells of the requested cell type
    df = df[(df["well_type"]=="positive_control") & (df["cell_type"]==cell_type)]
    
    return create_data_dict(df)
Example #2
def rxrx_cell_type(cell_type):
    print(f"Loading metadata for {cell_type}...")
    # load metadata
    df = rio.combine_metadata().reset_index()
    # select only one cell_type
    df = df[df["cell_type"]==cell_type]
    
    return create_data_dict(df)
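Both helpers delegate to a create_data_dict defined elsewhere in the source project and assume rxrx.io is imported as rio. A hypothetical usage sketch:

huvec_controls = rxrx_control_cell_type("HUVEC")  # positive-control wells only
huvec_all = rxrx_cell_type("HUVEC")               # every HUVEC well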
Example #3
def load_metadata(from_server=False,
                  path="/hdd/LINUX/codes/cell_perturbation/metadata.pickle"):
    """
    returns metadata as pandas.DataFrame
    """
    if from_server:
        metadata = rio.combine_metadata()
        pd.to_pickle(metadata, path)
    else:
        metadata = pd.read_pickle(path)

    return metadata
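The from_server flag doubles as a cache refresh: one slow call combines the metadata and writes the pickle, after which every load is local. For example:

metadata = load_metadata(from_server=True)  # one-time: combine and cache
metadata = load_metadata()                  # later runs: read the pickle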
Example #4
def build_files_index():
    print('Reading training data meta.')
    trn_df = collect_records(TRAIN)
    trn_df['dataset'] = 'train'
    print('Reading testing data meta.')
    tst_df = collect_records(TEST)
    tst_df['dataset'] = 'test'
    print('Merging meta-information into single index table.')
    df = pd.concat([trn_df, tst_df], axis=0)
    keys = ['id_code', 'site', 'dataset']
    df.set_index(keys, inplace=True)
    meta = rio.combine_metadata(base_path=ROOT)
    meta = meta.reset_index().set_index(keys)
    return df.join(meta).reset_index()
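collect_records, TRAIN, TEST, and ROOT are module-level names from the surrounding project; for the join to line up, collect_records must return one row per image with at least the id_code and site key columns. A hypothetical call:

files_index = build_files_index()
print(files_index[['id_code', 'site', 'dataset', 'cell_type']].head())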
Example #5
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Aug  6 23:07:04 2019

@author: user1
"""

import rxrx.io as rio
import rxrx.preprocess.images2tfrecords as i2tf 

images_path = r"../data/raw"
dest_path = r"../data/processed/controls"
meta_path = r"../data/metadata"

md = rio.combine_metadata(base_path=meta_path)
#md[(md.dataset == "train") & (md.well_type == "treatment")].drop_duplicates().sort_values(by='sirna')

# 125,510 rows observed; 51 experiments * 4 plates * 308 wells * 2 sites = 125,664.
#mdtreat = md[md.well_type == "treatment"]
#mdtest = md[md.dataset == "test"]

md.head()
md = md[md.well_type != "treatment"]

# 12,194 rows remain; 51 experiments * 4 plates * 30 control wells * 2 sites = 12,240, so 46 wells are voided.
#md["dataset"]="test_pos_ctrl"

i2tf.pack_tfrecords_ctrl(images_path=images_path,
                         metadata_df=md,
                         num_workers=12,
                         dest_path=dest_path)  # dest_path assumed; the call was truncated in the source
Example #6
# In[3]:


# Ref:
#     rxrx/io.py, line: 14, 15

LOCAL_IMAGES_BASE_PATH = 'D:\\_peng\\recursion-cellular-image-classification'  # windows
DEFAULT_METADATA_BASE_PATH = LOCAL_IMAGES_BASE_PATH


# In[4]:


# train.csv, train_controls.csv, test.csv, test_controls.csv

md = rio.combine_metadata(base_path=DEFAULT_METADATA_BASE_PATH)


# In[5]:


md.info()


# In[6]:


md.head()


# **Cell Type**
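

# In[7]:


# a plausible next cell for the "Cell Type" heading above (hypothetical):
md['cell_type'].value_counts()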
Example #7
import errno  # needed for the EEXIST check in create_folder below
import os
import sys

import numpy as np
import tqdm
from progress.bar import Bar
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

sys.path.append('rxrx1-utils/')
import rxrx.io as rio

import warnings

warnings.filterwarnings("ignore")

combined_df = rio.combine_metadata()
train_df = combined_df[combined_df['dataset'] == 'train']
train_df = train_df[train_df['well_type'] == 'treatment']
print('train_df shape: ', train_df.shape)

test_df = combined_df[combined_df['dataset'] == 'test']
test_df = test_df[test_df['well_type'] == 'treatment']
print(test_df.shape)

x_train_df, x_valid_df = train_test_split(train_df, test_size=0.20)


def create_folder(folderName):
    if not os.path.exists(folderName):
        try:
            os.makedirs(folderName)
        except OSError as exc:
            # tolerate a race where the folder appeared in the meantime
            if exc.errno != errno.EEXIST:
                raise
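
# usage sketch (paths illustrative): the EEXIST guard makes repeated calls safe
for name in ('train', 'valid', 'test'):
    create_folder(os.path.join('output', name))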
Example #8
# loop header reconstructed: `t` is assumed to be a 6-channel site tensor,
# e.g. t = rio.load_site('train', 'HUVEC-08', 4, 'K09', 1)
fig, axes = plt.subplots(1, 6, figsize=(24, 4))
for i, ax in enumerate(axes.flatten()):
    ax.axis('off')
    ax.set_title('channel {}'.format(i + 1))
    _ = ax.imshow(t[:, :, i], cmap='gray')
x = rio.convert_tensor_to_rgb(t)
x.shape
plt.figure(figsize=(8, 8))
plt.axis('off')

_ = plt.imshow(x)
y = rio.load_site_as_rgb('train', 'HUVEC-08', 4, 'K09', 1)

plt.figure(figsize=(8, 8))
plt.axis('off')

_ = plt.imshow(y)
md = rio.combine_metadata()
md.head()
import seaborn as sns
md.head(10)
md.index
for i in md.columns:
    print(">> ", i, "\t", md[i].unique())
for col in [
        'cell_type', 'dataset', 'experiment', 'plate', 'site', 'well_type'
]:
    print(col)
    print(md[col].value_counts())
    sns.countplot(y=col, data=md, order=md[col].value_counts().index)
    plt.show()

missing_values_count = md.isnull().sum()
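# `missing_values_count` is computed but not displayed; only `sirna` should
# have missing entries, since test treatment wells ship unlabelled:
print(missing_values_count[missing_values_count > 0])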
Example #9
def main():
    args = get_args()
    print(args)

    gpus = args.gpus
    os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
    os.environ['CUDA_VISIBLE_DEVICES'] = gpus
    device = 'cuda'

    n_gpu = len(gpus.split(','))
    set_seeds(args.seed)
    WELL_TYPE = 'treatment'
    data_path = args.data_path
    with_plates = args.with_plates
    model_name = args.model_name
    exp_suffix = args.exp_suffix
    bss = list(range(32, 129))
    cell_types = ['HEPG2', 'HUVEC', 'RPE', 'U2OS']
    root = Path('_'.join([model_name, exp_suffix]))
    cell_to_model = {
        'HEPG2': root / 'HEPG2' / 'seq_train_dev',
        'HUVEC': root / 'HUVEC' / 'seq_train_dev',
        'RPE': root / 'RPE' / 'seq_train_dev',
        'U2OS': root / 'U2OS' / 'seq_train_dev',
    }
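    # one checkpoint per cell type, produced by the final 'seq_train_dev'
    # fine-tuning stage of the training routine (second main() below)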

    dev_tgs, dev_predictions, dev_predictions_fixed = [], [], []
    test_ids, predictions, predictions_fixed = [], [], []
    all_predictions = []
    for CELL_TYPE in cell_types:
        criterion = nn.BCEWithLogitsLoss()

        df_ft = rio.combine_metadata(base_path=data_path)
        df_ft.reset_index(inplace=True)
        df_ft = df_ft[(df_ft.well_type == WELL_TYPE)
                      & (df_ft.dataset == 'train')].copy()
        single_df_ft = df_ft[df_ft['cell_type'] == CELL_TYPE].copy()

        NUM_CLASSES_FT = len(single_df_ft.sirna.unique())

        single_df_ft.sirna = single_df_ft.sirna.apply(np.int64)

        train_exp_names_ft = sorted(single_df_ft.experiment.unique())
        dev_exp_names_ft = train_exp_names_ft[-1:]
        train_exp_names_ft = train_exp_names_ft[:-1]
        print(train_exp_names_ft)
        print(dev_exp_names_ft)

        model_ft2 = get_model(name=model_name,
                              num_classes=NUM_CLASSES_FT,
                              with_plates=with_plates).to(device)
        path_to_pretrained2 = cell_to_model[CELL_TYPE] / 'model.pt'
        state_dict = torch.load(path_to_pretrained2)['state_dict']
        model_ft2.load_state_dict(state_dict, strict=False)
        FP16 = args.fp16 and IS_APEX
        if FP16:
            model_ft2 = apex.amp.initialize(model_ft2, opt_level='O1')
        if n_gpu > 1:
            model_ft2 = nn.parallel.DataParallel(model_ft2)

        loc_tgs = []
        loc_dev_preds = []
        loc_dev_preds_fixed = []
        for exp in dev_exp_names_ft:
            print(f'exp: {exp}')
            dev_predictions_bs = []
            for bs in bss:
                print(f'batch: {bs}')
                train_loaders_ft, dev_loaders1_ft, dev_loaders2_ft = get_loaders(
                    single_df_ft,
                    train_exp_names_ft, [exp],
                    root=data_path,
                    batch_size=bs * n_gpu,
                    with_plates=with_plates)
                with torch.no_grad():
                    loss, acc, preds1, targets1, plates1 = epoch_step(
                        dev_loaders1_ft,
                        f'[ Validating {CELL_TYPE} 1 ({exp}/{bs}).. ]',
                        net=model_ft2,
                        criterion=criterion,
                        device=device,
                        with_preds=True,
                        opt=None,
                        fp16=FP16)
                    print(f'loss site 1: {loss:.4} ({len(preds1)})')
                    print(f'acc site 1: {acc:.4}')

                    loss, acc, preds2, targets2, plates2 = epoch_step(
                        dev_loaders2_ft,
                        f'[ Validating {CELL_TYPE} 2 ({exp}/{bs}).. ]',
                        net=model_ft2,
                        criterion=criterion,
                        device=device,
                        with_preds=True,
                        opt=None,
                        fp16=FP16)
                    print(f'loss site 2: {loss:.4}')
                    print(f'acc site 2: {acc:.4}')

                    assert (targets1 == targets2).all()
                    assert (plates1 == plates2).all()
                    preds = np.mean(np.stack([preds1, preds2]), axis=0)
                    dev_predictions_bs.append(preds)
                    acc = (preds.argmax(-1) == targets1).mean()
                    print(f'acc: {acc:.4}')

                print()
            loc_tgs.extend(targets1)
            preds = np.mean(np.array(dev_predictions_bs), axis=0)
            print(
                f'mean over batches: {(preds.argmax(-1) == targets1).mean():.4} ({len(preds)})'
            )
            loc_dev_preds.extend(preds.argmax(-1))
            fixed_preds = fix_preds(preds)
            assert len(fixed_preds) == len(preds), f'{len(fixed_preds)}'
            print(
                f'mean over batches (fixed): {(fixed_preds.c.values == targets1).mean():.4}'
            )
            loc_dev_preds_fixed.extend(fixed_preds.c.values)

        dev_tgs.extend(loc_tgs)
        dev_predictions.extend(loc_dev_preds)
        dev_predictions_fixed.extend(loc_dev_preds_fixed)

        test_df = rio.combine_metadata(base_path=data_path)
        test_df.reset_index(inplace=True)
        test_df = test_df[(test_df.well_type == WELL_TYPE)
                          & (test_df.dataset == 'test')].copy()
        to_test = test_df[test_df['cell_type'] == CELL_TYPE].copy()

        loc_ids = []
        loc_preds = []
        loc_preds_fixed = []
        loc_preds_all = []
        for exp in to_test.experiment.unique():
            print(f'exp: {exp}')
            predictions_bs = []
            for bs in bss:
                print(f'batch: {bs}')
                test_loaders1, test_loaders2 = get_test_loaders(
                    to_test, [exp],
                    root=data_path,
                    batch_size=bs * n_gpu,
                    with_plates=with_plates)
                with torch.no_grad():
                    preds1, ids1, plates1 = predict(
                        test_loaders1,
                        f'[ Testing {CELL_TYPE} 1 ({exp}/{bs}).. ]',
                        net=model_ft2,
                        device=device)
                    print(f'len {len(preds1)}')
                    preds2, ids2, plates2 = predict(
                        test_loaders2,
                        f'[ Testing {CELL_TYPE} 2 ({exp}/{bs}).. ]',
                        net=model_ft2,
                        device=device)

                    assert (ids1 == ids2).all()
                    assert (plates1 == plates2).all()
                    preds = np.mean(np.stack([preds1, preds2]), axis=0)
                    assert len(ids1) == len(preds)
                    predictions_bs.append(preds)

            loc_ids.extend(ids1)

            preds = np.mean(np.array(predictions_bs), axis=0)
            loc_preds.extend(preds.argmax(-1))
            fixed_preds = fix_preds(preds)
            assert len(fixed_preds) == len(preds)
            loc_preds_fixed.extend(fixed_preds.c.values)

            loc_preds_all.extend(preds)

        test_ids.extend(loc_ids)
        predictions.extend(loc_preds)
        predictions_fixed.extend(loc_preds_fixed)
        all_predictions.extend(loc_preds_all)

        assert len(test_ids) == len(predictions) == len(predictions_fixed)

    dev_tgs, dev_predictions, dev_predictions_fixed = map(
        np.array, [dev_tgs, dev_predictions, dev_predictions_fixed])
    all_predictions = np.array(all_predictions)
    print(f'acc           : {(dev_tgs == dev_predictions).mean():.4}')
    print(f'acc (fixed)   : {(dev_tgs == dev_predictions_fixed).mean():.4}')
    to_sub = pd.DataFrame(zip(
        test_ids,
        predictions,
        predictions_fixed,
        *all_predictions.T,
    ),
                          columns=[
                              'id_code',
                              'sirna',
                              'sirna_fixed',
                          ] + [f'p_{i}' for i in range(NUM_CLASSES_FT)])
    to_sub.to_csv('submission_SUB_ACC16_p.csv', index=False)

    # plate "leak": exploit the structured assignment of siRNAs to plates
    train_csv = pd.read_csv(data_path / 'train.csv')
    test_csv = pd.read_csv(data_path / 'test.csv')
    test_csv = pd.merge(test_csv, to_sub, how='left', on='id_code')
    sub = pd.read_csv('submission_SUB_ACC16_p.csv')
    assert (test_csv.id_code.values == sub.id_code.values).all()
    plate_groups = np.zeros((NUM_CLASSES_FT, 4), int)
    for sirna in range(NUM_CLASSES_FT):
        grp = train_csv.loc[train_csv.sirna ==
                            sirna, :].plate.value_counts().index.values
        assert len(grp) == 3
        plate_groups[sirna, 0:3] = grp
        plate_groups[sirna, 3] = 10 - grp.sum()
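    # plate ids are 1..4 and sum to 10: each siRNA appears on exactly 3 of
    # the 4 plates across the training experiments, so the plate it never
    # appears on is recovered as 10 minus the sum of the other three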

    all_test_exp = test_csv.experiment.unique()

    group_plate_probs = np.zeros((len(all_test_exp), 4))
    for idx in range(len(all_test_exp)):
        preds = sub.loc[test_csv.experiment == all_test_exp[idx],
                        'sirna_fixed'].values
        pp_mult = np.zeros((len(preds), NUM_CLASSES_FT))
        pp_mult[range(len(preds)), preds] = 1

        sub_test = test_csv.loc[test_csv.experiment == all_test_exp[idx], :]
        assert len(pp_mult) == len(sub_test)

        for j in range(4):
            mask = np.repeat(plate_groups[np.newaxis, :, j], len(pp_mult), axis=0) == \
                   np.repeat(sub_test.plate.values[:, np.newaxis], NUM_CLASSES_FT, axis=1)

            group_plate_probs[idx,
                              j] = np.array(pp_mult)[mask].sum() / len(pp_mult)
    exp_to_group = group_plate_probs.argmax(1)

    def select_plate_group(pp_mult, idx):
        sub_test = test_csv.loc[test_csv.experiment == all_test_exp[idx], :]
        assert len(pp_mult) == len(sub_test)
        mask = np.repeat(plate_groups[np.newaxis, :, exp_to_group[idx]], len(pp_mult), axis=0) != \
               np.repeat(sub_test.plate.values[:, np.newaxis], NUM_CLASSES_FT, axis=1)
        pp_mult[mask] = 0

        return pp_mult

    for idx in range(len(all_test_exp)):
        indices = (test_csv.experiment == all_test_exp[idx])

        preds = test_csv[indices].copy()
        preds = preds[[f'p_{i}' for i in range(NUM_CLASSES_FT)]].values

        preds = select_plate_group(preds, idx)
        sub.loc[indices, 'sirna_leak'] = preds.argmax(1)

        preds_fixed = fix_preds(preds)
        assert len(preds_fixed) == len(preds)
        sub.loc[indices, 'sirna_leak_fixed'] = preds_fixed.c.values

    sub.to_csv('submission_SUB_ACC16_p_leak.csv', index=False)
Example #10
def main():
    args = get_args()
    print(args)

    gpus = args.gpus
    os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
    os.environ['CUDA_VISIBLE_DEVICES'] = gpus
    device = 'cuda'

    n_gpu = len(gpus.split(','))
    set_seeds(args.seed)
    WELL_TYPE = 'treatment'
    CELL_TYPE = args.cell_type
    data_path = args.data_path
    with_plates = args.with_plates
    model_name = args.model_name
    exp_suffix = args.exp_suffix
    FP16 = args.fp16 and IS_APEX
    batch_size = args.batch_size
    epochs = args.epochs
    lr = args.lr

    criterion = nn.BCEWithLogitsLoss()

    df = rio.combine_metadata(base_path=data_path)
    df.reset_index(inplace=True)
    df = df[(df.well_type != WELL_TYPE)].copy()
    single_df = df[df['cell_type'] == CELL_TYPE].copy()
    NUM_CLASSES = len(single_df.sirna.unique())

    mapping = {
        cl: ind
        for ind, cl in enumerate(sorted(single_df.sirna.unique()))
    }
    single_df.sirna = single_df.sirna.apply(lambda x: mapping[x])

    train_exp_names = sorted(
        single_df[single_df.dataset == 'train'].experiment.unique())
    dev_exp_names = sorted(
        single_df[single_df.dataset == 'test'].experiment.unique())

    train_loaders, dev_loaders1, dev_loaders2 = get_loaders(
        single_df,
        train_exp_names,
        dev_exp_names,
        root=data_path,
        batch_size=batch_size,
        n_gpu=n_gpu,
        with_plates=with_plates)

    path_to_exp = Path('_'.join([model_name, exp_suffix
                                 ])) / CELL_TYPE / 'seq_pretrain'
    if not path_to_exp.exists():
        path_to_exp.mkdir(parents=True)

    model = get_model(name=model_name,
                      num_classes=NUM_CLASSES,
                      with_plates=with_plates).to(device)
    opt = torch.optim.Adam(model.parameters(), lr=lr, amsgrad=True)
    if FP16:
        model, opt = apex.amp.initialize(model, opt, opt_level='O1')
    if n_gpu > 1:
        model = nn.parallel.DataParallel(model)

    scheduler = None
    train_model((train_loaders, dev_loaders1, dev_loaders2),
                model=model,
                criterion=criterion,
                opt=opt,
                path=path_to_exp,
                device=device,
                fp16=FP16,
                epochs=epochs,
                scheduler=scheduler)

    # pretrain the classifier head on the treatment labels
    path_to_pretrained = path_to_exp / 'model.pt'
    df_ft = rio.combine_metadata(base_path=data_path)
    df_ft.reset_index(inplace=True)
    df_ft = df_ft[(df_ft.well_type == WELL_TYPE)
                  & (df_ft.dataset == 'train')].copy()
    single_df_ft = df_ft[df_ft['cell_type'] == CELL_TYPE].copy()

    NUM_CLASSES_FT = len(single_df_ft.sirna.unique())

    single_df_ft.sirna = single_df_ft.sirna.apply(np.int64)

    train_exp_names_ft = sorted(single_df_ft.experiment.unique())
    dev_exp_names_ft = train_exp_names_ft[-1:]
    train_exp_names_ft = train_exp_names_ft[:-1]

    train_loaders_ft, dev_loaders1_ft, dev_loaders2_ft = get_loaders(
        single_df_ft,
        train_exp_names_ft,
        dev_exp_names_ft,
        root=data_path,
        batch_size=batch_size,
        n_gpu=n_gpu,
        with_plates=with_plates)
    path_to_exp_ft = Path('_'.join([model_name, exp_suffix
                                    ])) / CELL_TYPE / 'seq_train_head'
    if not path_to_exp_ft.exists():
        path_to_exp_ft.mkdir(parents=True)
    model_ft = get_model(name=model_name,
                         num_classes=NUM_CLASSES_FT,
                         with_plates=with_plates).to(device)
    state_dict = torch.load(path_to_pretrained)['state_dict']
    state_dict.pop('classifier.weight')
    state_dict.pop('classifier.bias')
    model_ft.load_state_dict(state_dict, strict=False)
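    # freeze the backbone so only the freshly initialised classifier trains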
    for n, p in model_ft.named_parameters():
        if not n.startswith('classifier'):
            p.requires_grad = False
    opt_ft = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                     model_ft.parameters()),
                              lr=lr,
                              amsgrad=True)

    if FP16:
        model_ft, opt_ft = apex.amp.initialize(model_ft,
                                               opt_ft,
                                               opt_level='O1')
    if n_gpu > 1:
        model_ft = nn.parallel.DataParallel(model_ft)
    scheduler = None
    train_model((train_loaders_ft, dev_loaders1_ft, dev_loaders2_ft),
                model=model_ft,
                criterion=criterion,
                opt=opt_ft,
                path=path_to_exp_ft,
                device=device,
                fp16=FP16,
                epochs=epochs + 50,
                scheduler=scheduler)

    # finetune whole model
    path_to_exp_ft2 = Path('_'.join([model_name, exp_suffix
                                     ])) / CELL_TYPE / 'seq_train'
    if not path_to_exp_ft2.exists():
        path_to_exp_ft2.mkdir(parents=True)
    model_ft2 = get_model(name=model_name,
                          num_classes=NUM_CLASSES_FT,
                          with_plates=with_plates).to(device)
    path_to_pretrained2 = path_to_exp_ft / 'model.pt'
    state_dict = torch.load(path_to_pretrained2)['state_dict']
    model_ft2.load_state_dict(state_dict)
    opt_ft2 = torch.optim.Adam(model_ft2.parameters(), lr=lr, amsgrad=True)
    if FP16:
        model_ft2, opt_ft2 = apex.amp.initialize(model_ft2,
                                                 opt_ft2,
                                                 opt_level='O1')
    if n_gpu > 1:
        model_ft2 = nn.parallel.DataParallel(model_ft2)
    scheduler = torch.optim.lr_scheduler.MultiStepLR(opt_ft2,
                                                     milestones=[120, 150],
                                                     gamma=args.gamma)
    train_model((train_loaders_ft, dev_loaders1_ft, dev_loaders2_ft),
                model=model_ft2,
                criterion=criterion,
                opt=opt_ft2,
                path=path_to_exp_ft2,
                device=device,
                fp16=FP16,
                epochs=epochs + 75,
                scheduler=scheduler)

    # finetune on validation
    path_to_exp_ft3 = Path('_'.join([model_name, exp_suffix
                                     ])) / CELL_TYPE / 'seq_train_dev'
    if not path_to_exp_ft3.exists():
        path_to_exp_ft3.mkdir(parents=True)
    opt_ft2 = torch.optim.Adam(model_ft2.parameters(), lr=1e-5, amsgrad=True)
    if FP16:
        model_ft2, opt_ft2 = apex.amp.initialize(model_ft2,
                                                 opt_ft2,
                                                 opt_level='O1')
    if n_gpu > 1:
        model_ft2 = nn.parallel.DataParallel(model_ft2)
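    # final stage: fold the dev experiment (both sites) into the training
    # data for a few epochs at a low learning rate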
    train_model(
        (list(it.chain(train_loaders_ft, dev_loaders1_ft,
                       dev_loaders2_ft)), dev_loaders1_ft, dev_loaders2_ft),
        model=model_ft2,
        criterion=criterion,
        opt=opt_ft2,
        path=path_to_exp_ft3,
        device=device,
        fp16=FP16,
        epochs=15,
        scheduler=None)
Example #11
def rxrx_all():
    print("Loading metadata for all images...")
    # load metadata
    df = rio.combine_metadata().reset_index()
    
    return create_data_dict(df)