def rxrx_control_cell_type(cell_type):
    print(f"Loading metadata for control {cell_type}...")
    # load metadata
    df = rio.combine_metadata().reset_index()
    # select only one cell_type
    df = df[(df["well_type"] == "positive_control") & (df["cell_type"] == cell_type)]
    return create_data_dict(df)

def rxrx_cell_type(cell_type):
    print(f"Loading metadata for {cell_type}...")
    # load metadata
    df = rio.combine_metadata().reset_index()
    # select only one cell_type
    df = df[df["cell_type"] == cell_type]
    return create_data_dict(df)

def load_metadata(from_server=False, path="/hdd/LINUX/codes/cell_perturbation/metadata.pickle"):
    """Return the combined metadata as a pandas.DataFrame."""
    if from_server:
        metadata = rio.combine_metadata()
        pd.to_pickle(metadata, path)
    else:
        metadata = pd.read_pickle(path)
    return metadata

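# A minimal usage sketch (not part of the original snippet), assuming rxrx.io
# and pandas are already imported as rio and pd: build the pickle cache once,
# then reuse it on later runs.
metadata = load_metadata(from_server=True)   # first run: pull metadata and cache it to disk
metadata = load_metadata()                   # later runs: read the cached pickle
print(metadata.shape)
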
def build_files_index():
    print('Reading training data meta.')
    trn_df = collect_records(TRAIN)
    trn_df['dataset'] = 'train'

    print('Reading testing data meta.')
    tst_df = collect_records(TEST)
    tst_df['dataset'] = 'test'

    print('Merging meta-information into single index table.')
    df = pd.concat([trn_df, tst_df], axis='rows')
    keys = ['id_code', 'site', 'dataset']
    df.set_index(keys, inplace=True)
    meta = rio.combine_metadata(base_path=ROOT)
    meta = meta.reset_index().set_index(keys)
    return df.join(meta).reset_index()

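# Hypothetical sketch of the collect_records helper that build_files_index
# relies on; it is not defined in this snippet, so the column set and the
# directory layout below are illustrative assumptions (standard RxRx1 layout
# <root>/<experiment>/Plate<k>/<well>_s<site>_w<channel>.png), not the
# original implementation.
import re
from pathlib import Path
import pandas as pd

def collect_records(root):
    pattern = re.compile(r'(?P<well>[A-Z]\d{2})_s(?P<site>\d)_w(?P<channel>\d)\.png$')
    records = []
    for path in Path(root).glob('*/Plate*/*.png'):
        m = pattern.search(path.name)
        if m is None:
            continue
        experiment = path.parts[-3]
        plate = int(path.parts[-2].replace('Plate', ''))
        records.append({
            'id_code': f"{experiment}_{plate}_{m['well']}",
            'site': int(m['site']),
            'path': str(path),
        })
    # one row per (id_code, site); channel paths can be re-derived from 'path'
    return pd.DataFrame(records, columns=['id_code', 'site', 'path']).drop_duplicates(['id_code', 'site'])
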
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 6 23:07:04 2019

@author: user1
"""
import rxrx.io as rio
import rxrx.preprocess.images2tfrecords as i2tf

images_path = r"../data/raw"
dest_path = r"../data/processed/controls"
meta_path = r"../data/metadata"

md = rio.combine_metadata(base_path=meta_path)
# md[(md.dataset == "train") & (md.well_type == "treatment")].drop_duplicates().sort_values(by='sirna')  # 125510. 51*4*308*2=125664.
# mdtreat = md[md.well_type == "treatment"]
# mdtest = md[md.dataset == "test"]
md.head()

md = md[md.well_type != "treatment"]  # 12194 rows. 51*4*30*2=12240. 44 voided.
# md["dataset"] = "test_pos_ctrl"

i2tf.pack_tfrecords_ctrl(images_path=images_path,
                         metadata_df=md,
                         num_workers=12,

# In[3]:

# Ref: rxrx/io.py, lines 14-15
LOCAL_IMAGES_BASE_PATH = 'D:\\_peng\\recursion-cellular-image-classification'  # windows
DEFAULT_METADATA_BASE_PATH = LOCAL_IMAGES_BASE_PATH

# In[4]:

# combines train.csv, train_controls.csv, test.csv, test_controls.csv
md = rio.combine_metadata(base_path=DEFAULT_METADATA_BASE_PATH)

# In[5]:

md.info()

# In[6]:

md.head()

# Cell Type

import os
import sys
import errno

import numpy as np
import tqdm
from progress.bar import Bar
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

sys.path.append('rxrx1-utils/')
import rxrx.io as rio

import warnings
warnings.filterwarnings("ignore")

combined_df = rio.combine_metadata()

train_df = combined_df[combined_df['dataset'] == 'train']
train_df = train_df[train_df['well_type'] == 'treatment']
print('train_df shape: ', train_df.shape)

test_df = combined_df[combined_df['dataset'] == 'test']
test_df = test_df[test_df['well_type'] == 'treatment']
print(test_df.shape)

x_train_df, x_valid_df = train_test_split(train_df, test_size=0.20)


def create_folder(folderName):
    if not os.path.exists(folderName):
        try:
            os.makedirs(folderName)
        except OSError as exc:
            # ignore the race where another process created the folder first
            if exc.errno != errno.EEXIST:
                raise

ax.axis('off')
ax.set_title('channel {}'.format(i + 1))
_ = ax.imshow(t[:, :, i], cmap='gray')

x = rio.convert_tensor_to_rgb(t)
x.shape

plt.figure(figsize=(8, 8))
plt.axis('off')
_ = plt.imshow(x)

y = rio.load_site_as_rgb('train', 'HUVEC-08', 4, 'K09', 1)
plt.figure(figsize=(8, 8))
plt.axis('off')
_ = plt.imshow(y)

md = rio.combine_metadata()
md.head()

import seaborn as sns

md.head(10)
md.index

for i in md.columns:
    print(">> ", i, "\t", md[i].unique())

for col in ['cell_type', 'dataset', 'experiment', 'plate', 'site', 'well_type']:
    print(col)
    print(md[col].value_counts())
    sns.countplot(y=col, data=md, order=md[col].value_counts().index)
    plt.show()

missing_values_count = md.isnull().sum()

def main():
    args = get_args()
    print(args)

    gpus = args.gpus
    os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
    os.environ['CUDA_VISIBLE_DEVICES'] = gpus
    device = 'cuda'
    n_gpu = len(gpus.split(','))

    set_seeds(args.seed)

    WELL_TYPE = 'treatment'
    data_path = args.data_path
    with_plates = args.with_plates
    model_name = args.model_name
    exp_suffix = args.exp_suffix
    bss = list(range(32, 129))  # batch sizes to average predictions over
    cell_types = ['HEPG2', 'HUVEC', 'RPE', 'U2OS']

    root = Path('_'.join([model_name, exp_suffix]))
    cell_to_model = {
        'HEPG2': root / 'HEPG2' / 'seq_train_dev',
        'HUVEC': root / 'HUVEC' / 'seq_train_dev',
        'RPE': root / 'RPE' / 'seq_train_dev',
        'U2OS': root / 'U2OS' / 'seq_train_dev',
    }

    dev_tgs, dev_predictions, dev_predictions_fixed = [], [], []
    test_ids, predictions, predictions_fixed = [], [], []
    all_predictions = []
    for CELL_TYPE in cell_types:
        criterion = nn.BCEWithLogitsLoss()

        df_ft = rio.combine_metadata(base_path=data_path)
        df_ft.reset_index(inplace=True)
        df_ft = df_ft[(df_ft.well_type == WELL_TYPE)
                      & (df_ft.dataset == 'train')].copy()
        signle_df_ft = df_ft[df_ft['cell_type'] == CELL_TYPE].copy()
        NUM_CLASSES_FT = len(signle_df_ft.sirna.unique())
        signle_df_ft.sirna = signle_df_ft.sirna.apply(np.int64)

        # last experiment of this cell type is held out as dev
        train_exp_names_ft = sorted(signle_df_ft.experiment.unique())
        dev_exp_names_ft = train_exp_names_ft[-1:]
        train_exp_names_ft = train_exp_names_ft[:-1]
        print(train_exp_names_ft)
        print(dev_exp_names_ft)

        model_ft2 = get_model(name=model_name,
                              num_classes=NUM_CLASSES_FT,
                              with_plates=with_plates).to(device)
        path_to_pretrained2 = cell_to_model[CELL_TYPE] / 'model.pt'
        state_dict = torch.load(path_to_pretrained2)['state_dict']
        model_ft2.load_state_dict(state_dict, strict=False)

        FP16 = args.fp16 and IS_APEX
        if FP16:
            model_ft2 = apex.amp.initialize(model_ft2, opt_level='O1')
        if n_gpu > 1:
            model_ft2 = nn.parallel.DataParallel(model_ft2)

        # validation: average site-1/site-2 predictions over all batch sizes
        loc_tgs = []
        loc_dev_preds = []
        loc_dev_preds_fixed = []
        for exp in dev_exp_names_ft:
            print(f'exp: {exp}')
            dev_predictions_bs = []
            for bs in bss:
                print(f'batch: {bs}')
                train_loaders_ft, dev_loaders1_ft, dev_loaders2_ft = get_loaders(
                    signle_df_ft,
                    train_exp_names_ft, [exp],
                    root=data_path,
                    batch_size=bs * n_gpu,
                    with_plates=with_plates)

                with torch.no_grad():
                    loss, acc, preds1, targets1, plates1 = epoch_step(
                        dev_loaders1_ft,
                        f'[ Validating {CELL_TYPE} 1 ({exp}/{bs}).. ]',
                        net=model_ft2,
                        criterion=criterion,
                        device=device,
                        with_preds=True,
                        opt=None,
                        fp16=FP16)
                    print(f'loss site 1: {loss:.4} ({len(preds1)})')
                    print(f'acc site 1: {acc:.4}')

                    loss, acc, preds2, targets2, plates2 = epoch_step(
                        dev_loaders2_ft,
                        f'[ Validating {CELL_TYPE} 2 ({exp}/{bs}).. ]',
                        net=model_ft2,
                        criterion=criterion,
                        device=device,
                        with_preds=True,
                        opt=None,
                        fp16=FP16)
                    print(f'loss site 2: {loss:.4}')
                    print(f'acc site 2: {acc:.4}')

                assert (targets1 == targets2).all()
                assert (plates1 == plates2).all()
                preds = np.mean(np.stack([preds1, preds2]), axis=0)
                dev_predictions_bs.append(preds)

                acc = (preds.argmax(-1) == targets1).mean()
                print(f'acc: {acc:.4}')
                print()

            loc_tgs.extend(targets1)
            preds = np.mean(np.array(dev_predictions_bs), axis=0)
            print(
                f'mean over batches: {(preds.argmax(-1) == targets1).mean():.4} ({len(preds)})'
            )
            loc_dev_preds.extend(preds.argmax(-1))
            fixed_preds = fix_preds(preds)
            assert len(fixed_preds) == len(preds), f'{len(fixed_preds)}'
            print(
                f'mean over batches (fixed): {(fixed_preds.c.values == targets1).mean():.4}'
            )
            loc_dev_preds_fixed.extend(fixed_preds.c.values)

        dev_tgs.extend(loc_tgs)
        dev_predictions.extend(loc_dev_preds)
        dev_predictions_fixed.extend(loc_dev_preds_fixed)

        # test: same site/batch-size averaging, per test experiment
        test_df = rio.combine_metadata(base_path=data_path)
        test_df.reset_index(inplace=True)
        test_df = test_df[(test_df.well_type == WELL_TYPE)
                          & (test_df.dataset == 'test')].copy()
        to_test = test_df[test_df['cell_type'] == CELL_TYPE].copy()

        loc_ids = []
        loc_preds = []
        loc_preds_fixed = []
        loc_preds_all = []
        for exp in to_test.experiment.unique():
            print(f'exp: {exp}')
            predictions_bs = []
            for bs in bss:
                print(f'batch: {bs}')
                test_loaders1, test_loaders2 = get_test_loaders(
                    to_test, [exp],
                    root=data_path,
                    batch_size=bs * n_gpu,
                    with_plates=with_plates)

                with torch.no_grad():
                    preds1, ids1, plates1 = predict(
                        test_loaders1,
                        f'[ Testing {CELL_TYPE} 1 ({exp}/{bs}).. ]',
                        net=model_ft2,
                        device=device)
                    print(f'len {len(preds1)}')
                    preds2, ids2, plates2 = predict(
                        test_loaders2,
                        f'[ Testing {CELL_TYPE} 2 ({exp}/{bs}).. ]',
                        net=model_ft2,
                        device=device)

                assert (ids1 == ids2).all()
                assert (plates1 == plates2).all()
                preds = np.mean(np.stack([preds1, preds2]), axis=0)
                assert len(ids1) == len(preds)
                predictions_bs.append(preds)

            loc_ids.extend(ids1)
            preds = np.mean(np.array(predictions_bs), axis=0)
            loc_preds.extend(preds.argmax(-1))
            fixed_preds = fix_preds(preds)
            assert len(fixed_preds) == len(preds)
            loc_preds_fixed.extend(fixed_preds.c.values)
            loc_preds_all.extend(preds)

        test_ids.extend(loc_ids)
        predictions.extend(loc_preds)
        predictions_fixed.extend(loc_preds_fixed)
        all_predictions.extend(loc_preds_all)

    assert len(test_ids) == len(predictions) == len(predictions_fixed)

    dev_tgs, dev_predictions, dev_predictions_fixed = map(
        np.array, [dev_tgs, dev_predictions, dev_predictions_fixed])
    all_predictions = np.array(all_predictions)
    print(f'acc : {(dev_tgs == dev_predictions).mean():.4}')
    print(f'acc (fixed) : {(dev_tgs == dev_predictions_fixed).mean():.4}')

    to_sub = pd.DataFrame(
        zip(
            test_ids,
            predictions,
            predictions_fixed,
            *all_predictions.T,
        ),
        columns=['id_code', 'sirna', 'sirna_fixed'] +
        [f'p_{i}' for i in range(NUM_CLASSES_FT)])
    to_sub.to_csv('submission_SUB_ACC16_p.csv', index=False)

    # plate "leak": siRNAs fall into four plate groups, so test predictions can
    # be restricted to classes consistent with each well's plate
    train_csv = pd.read_csv(data_path / 'train.csv')
    test_csv = pd.read_csv(data_path / 'test.csv')
    test_csv = pd.merge(test_csv, to_sub, how='left', on='id_code')

    sub = pd.read_csv('submission_SUB_ACC16_p.csv')
    assert (test_csv.id_code.values == sub.id_code.values).all()

    plate_groups = np.zeros((NUM_CLASSES_FT, 4), int)
    for sirna in range(NUM_CLASSES_FT):
        grp = train_csv.loc[train_csv.sirna == sirna, :].plate.value_counts().index.values
        assert len(grp) == 3
        plate_groups[sirna, 0:3] = grp
        plate_groups[sirna, 3] = 10 - grp.sum()

    all_test_exp = test_csv.experiment.unique()
    group_plate_probs = np.zeros((len(all_test_exp), 4))
    for idx in range(len(all_test_exp)):
        preds = sub.loc[test_csv.experiment == all_test_exp[idx],
                        'sirna_fixed'].values
        pp_mult = np.zeros((len(preds), NUM_CLASSES_FT))
        pp_mult[range(len(preds)), preds] = 1

        sub_test = test_csv.loc[test_csv.experiment == all_test_exp[idx], :]
        assert len(pp_mult) == len(sub_test)

        for j in range(4):
            mask = np.repeat(plate_groups[np.newaxis, :, j], len(pp_mult), axis=0) == \
                   np.repeat(sub_test.plate.values[:, np.newaxis], NUM_CLASSES_FT, axis=1)
            group_plate_probs[idx, j] = np.array(pp_mult)[mask].sum() / len(pp_mult)

    exp_to_group = group_plate_probs.argmax(1)

    def select_plate_group(pp_mult, idx):
        sub_test = test_csv.loc[test_csv.experiment == all_test_exp[idx], :]
        assert len(pp_mult) == len(sub_test)
        mask = np.repeat(plate_groups[np.newaxis, :, exp_to_group[idx]], len(pp_mult), axis=0) != \
               np.repeat(sub_test.plate.values[:, np.newaxis], NUM_CLASSES_FT, axis=1)
        pp_mult[mask] = 0
        return pp_mult

    for idx in range(len(all_test_exp)):
        indices = (test_csv.experiment == all_test_exp[idx])
        preds = test_csv[indices].copy()
        preds = preds[[f'p_{i}' for i in range(NUM_CLASSES_FT)]].values
        preds = select_plate_group(preds, idx)
        sub.loc[indices, 'sirna_leak'] = preds.argmax(1)
        preds_fixed = fix_preds(preds)
        assert len(preds_fixed) == len(preds)
        sub.loc[indices, 'sirna_leak_fixed'] = preds_fixed.c.values

    sub.to_csv('submission_SUB_ACC16_p_leak.csv', index=False)

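# Standalone toy sketch (not part of the script above) of the masking step that
# select_plate_group performs: once an experiment's plate group is chosen, a
# class is kept for a well only if that group places it on the well's plate.
# All names, shapes, and values below are illustrative assumptions.
import numpy as np

rng = np.random.default_rng(0)
num_classes, num_wells = 8, 5
# toy plate_groups: column j gives, for each class, the plate it sits on when
# the experiment follows plate-group j (the real table is built from train.csv)
plate_groups = np.stack([rng.permutation([1, 2, 3, 4]) for _ in range(num_classes)])
probs = rng.random((num_wells, num_classes))   # toy per-well class probabilities
plates = np.array([1, 2, 3, 4, 1])             # toy plate of each test well
group = 2                                      # toy plate-group column assigned to this experiment

# zero out classes whose expected plate (under the assigned group) is not the well's plate
keep = plate_groups[:, group][np.newaxis, :] == plates[:, np.newaxis]
masked = np.where(keep, probs, 0.0)
print(masked.argmax(1))
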
def main():
    args = get_args()
    print(args)

    gpus = args.gpus
    os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
    os.environ['CUDA_VISIBLE_DEVICES'] = gpus
    device = 'cuda'
    n_gpu = len(gpus.split(','))

    set_seeds(args.seed)

    WELL_TYPE = 'treatment'
    CELL_TYPE = args.cell_type
    data_path = args.data_path
    with_plates = args.with_plates
    model_name = args.model_name
    exp_suffix = args.exp_suffix
    FP16 = args.fp16 and IS_APEX
    batch_size = args.batch_size
    epochs = args.epochs
    lr = args.lr

    criterion = nn.BCEWithLogitsLoss()

    # pretrain on control (non-treatment) wells of this cell type
    df = rio.combine_metadata(base_path=data_path)
    df.reset_index(inplace=True)
    df = df[(df.well_type != WELL_TYPE)].copy()
    signle_df = df[df['cell_type'] == CELL_TYPE].copy()
    NUM_CLASSES = len(signle_df.sirna.unique())
    mapping = {
        cl: ind
        for ind, cl in enumerate(sorted(signle_df.sirna.unique()))
    }
    signle_df.sirna = signle_df.sirna.apply(lambda x: mapping[x])

    train_exp_names = sorted(
        signle_df[signle_df.dataset == 'train'].experiment.unique())
    dev_exp_names = sorted(
        signle_df[signle_df.dataset == 'test'].experiment.unique())

    train_loaders, dev_loaders1, dev_loaders2 = get_loaders(
        signle_df,
        train_exp_names,
        dev_exp_names,
        root=data_path,
        batch_size=batch_size,
        n_gpu=n_gpu,
        with_plates=with_plates)

    path_to_exp = Path('_'.join([model_name, exp_suffix
                                 ])) / CELL_TYPE / 'seq_pretrain'
    if not path_to_exp.exists():
        path_to_exp.mkdir(parents=True)

    model = get_model(name=model_name,
                      num_classes=NUM_CLASSES,
                      with_plates=with_plates).to(device)
    opt = torch.optim.Adam(model.parameters(), lr=lr, amsgrad=True)
    if FP16:
        model, opt = apex.amp.initialize(model, opt, opt_level='O1')
    if n_gpu > 1:
        model = nn.parallel.DataParallel(model)

    scheduler = None
    train_model((train_loaders, dev_loaders1, dev_loaders2),
                model=model,
                criterion=criterion,
                opt=opt,
                path=path_to_exp,
                device=device,
                fp16=FP16,
                epochs=epochs,
                scheduler=scheduler)

    # pretrain head: train only the classifier on treatment wells
    path_to_pretrained = path_to_exp / 'model.pt'

    df_ft = rio.combine_metadata(base_path=data_path)
    df_ft.reset_index(inplace=True)
    df_ft = df_ft[(df_ft.well_type == WELL_TYPE)
                  & (df_ft.dataset == 'train')].copy()
    signle_df_ft = df_ft[df_ft['cell_type'] == CELL_TYPE].copy()
    NUM_CLASSES_FT = len(signle_df_ft.sirna.unique())
    signle_df_ft.sirna = signle_df_ft.sirna.apply(np.int64)

    train_exp_names_ft = sorted(signle_df_ft.experiment.unique())
    dev_exp_names_ft = train_exp_names_ft[-1:]
    train_exp_names_ft = train_exp_names_ft[:-1]

    train_loaders_ft, dev_loaders1_ft, dev_loaders2_ft = get_loaders(
        signle_df_ft,
        train_exp_names_ft,
        dev_exp_names_ft,
        root=data_path,
        batch_size=batch_size,
        n_gpu=n_gpu,
        with_plates=with_plates)

    path_to_exp_ft = Path('_'.join([model_name, exp_suffix
                                    ])) / CELL_TYPE / 'seq_train_head'
    if not path_to_exp_ft.exists():
        path_to_exp_ft.mkdir(parents=True)

    model_ft = get_model(name=model_name,
                         num_classes=NUM_CLASSES_FT,
                         with_plates=with_plates).to(device)
    state_dict = torch.load(path_to_pretrained)['state_dict']
    state_dict.pop('classifier.weight')
    state_dict.pop('classifier.bias')
    model_ft.load_state_dict(state_dict, strict=False)
    for n, p in model_ft.named_parameters():
        if not n.startswith('classifier'):
            p.requires_grad = False

    opt_ft = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                     model_ft.parameters()),
                              lr=lr,
                              amsgrad=True)
    if FP16:
        model_ft, opt_ft = apex.amp.initialize(model_ft, opt_ft, opt_level='O1')
    if n_gpu > 1:
        model_ft = nn.parallel.DataParallel(model_ft)

    scheduler = None
    train_model((train_loaders_ft, dev_loaders1_ft, dev_loaders2_ft),
                model=model_ft,
                criterion=criterion,
                opt=opt_ft,
                path=path_to_exp_ft,
                device=device,
                fp16=FP16,
                epochs=epochs + 50,
                scheduler=scheduler)

    # finetune whole model
    path_to_exp_ft2 = Path('_'.join([model_name, exp_suffix
                                     ])) / CELL_TYPE / 'seq_train'
    if not path_to_exp_ft2.exists():
        path_to_exp_ft2.mkdir(parents=True)

    model_ft2 = get_model(name=model_name,
                          num_classes=NUM_CLASSES_FT,
                          with_plates=with_plates).to(device)
    path_to_pretrained2 = path_to_exp_ft / 'model.pt'
    state_dict = torch.load(path_to_pretrained2)['state_dict']
    model_ft2.load_state_dict(state_dict)

    opt_ft2 = torch.optim.Adam(model_ft2.parameters(), lr=lr, amsgrad=True)
    if FP16:
        model_ft2, opt_ft2 = apex.amp.initialize(model_ft2, opt_ft2, opt_level='O1')
    if n_gpu > 1:
        model_ft2 = nn.parallel.DataParallel(model_ft2)

    scheduler = torch.optim.lr_scheduler.MultiStepLR(opt_ft2,
                                                     milestones=[120, 150],
                                                     gamma=args.gamma)
    train_model((train_loaders_ft, dev_loaders1_ft, dev_loaders2_ft),
                model=model_ft2,
                criterion=criterion,
                opt=opt_ft2,
                path=path_to_exp_ft2,
                device=device,
                fp16=FP16,
                epochs=epochs + 75,
                scheduler=scheduler)

    # finetune on validation
    path_to_exp_ft3 = Path('_'.join([model_name, exp_suffix
                                     ])) / CELL_TYPE / 'seq_train_dev'
    if not path_to_exp_ft3.exists():
        path_to_exp_ft3.mkdir(parents=True)

    opt_ft2 = torch.optim.Adam(model_ft2.parameters(), lr=1e-5, amsgrad=True)
    if FP16:
        model_ft2, opt_ft2 = apex.amp.initialize(model_ft2, opt_ft2, opt_level='O1')
    if n_gpu > 1:
        model_ft2 = nn.parallel.DataParallel(model_ft2)

    train_model(
        (list(it.chain(train_loaders_ft, dev_loaders1_ft, dev_loaders2_ft)),
         dev_loaders1_ft, dev_loaders2_ft),
        model=model_ft2,
        criterion=criterion,
        opt=opt_ft2,
        path=path_to_exp_ft3,
        device=device,
        fp16=FP16,
        epochs=15,
        scheduler=None)

def rxrx_all():
    print("Loading metadata for all images...")
    # load metadata
    df = rio.combine_metadata().reset_index()
    return create_data_dict(df)

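# Hypothetical sketch of the create_data_dict helper used by rxrx_all,
# rxrx_cell_type, and rxrx_control_cell_type; it is not defined in these
# snippets, so the output fields and the assumed image layout
# (<dataset>/<experiment>/Plate<plate>/<well>_s<site>_w<channel>.png) are
# illustrative assumptions rather than the original implementation.
import os

def create_data_dict(df, base_path='.', channels=(1, 2, 3, 4, 5, 6)):
    """Toy stand-in: one list of channel image paths plus a label per metadata row."""
    data = {'paths': [], 'labels': []}
    for _, row in df.iterrows():
        data['paths'].append([
            os.path.join(base_path, row['dataset'], row['experiment'],
                         f"Plate{row['plate']}",
                         f"{row['well']}_s{row['site']}_w{c}.png")
            for c in channels
        ])
        data['labels'].append(row.get('sirna'))
    return data
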