def get_data_loader(args, data_subset, n):
    """ Given a subset of a dataset as a Python dictionary to make predictions
    from, this function selects n items at random from that dataset to predict.
    It then returns a DataLoader for those items, along with a list of ids.
    """
    if not args.rnn:
        collate = paired_collate_fn
    else:
        collate = paired_collate_fn_with_len

    # n == 0 means "predict everything": return a loader over the full subset
    if n == 0:
        train_loader = torch.utils.data.DataLoader(
            ProteinDataset(
                seqs=data_subset['seq'],
                crds=data_subset['crd'],
                angs=data_subset['ang']),
            num_workers=2,
            batch_size=1,
            collate_fn=collate,
            shuffle=False)
        return train_loader, data_subset["ids"]

    # We just want to predict a few examples
    to_predict = set(s.upper() for s in np.random.choice(data_subset["ids"], n))  # ["2NLP_D", "3ASK_Q", "1SZA_C"]
    will_predict = []
    ids = []
    seqs = []
    angs = []
    crds = []
    for i, prot in enumerate(data_subset["ids"]):
        if prot.upper() in to_predict and prot.upper() not in will_predict:
            seqs.append(data_subset["seq"][i])
            angs.append(data_subset["ang"][i])
            crds.append(data_subset["crd"][i])
            ids.append(prot)
            will_predict.append(prot.upper())
    # np.random.choice may draw duplicates, so fewer than n unique ids is allowed
    assert (len(seqs) == n and len(angs) == n) or (len(seqs) == len(angs) and len(seqs) < n)

    data_loader = torch.utils.data.DataLoader(
        ProteinDataset(
            seqs=seqs,
            angs=angs,
            crds=crds),
        num_workers=2,
        batch_size=1,
        collate_fn=collate,
        shuffle=False)
    return data_loader, ids
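# Minimal usage sketch for get_data_loader above (an assumption, not part of the
# original script): the '--data'/'--rnn' arguments and the "validation" split
# name are hypothetical; only the dict keys "seq", "ang", "crd", "ids" come from
# the function itself.
if __name__ == '__main__':
    import argparse

    import torch

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--data', required=True, help='path to a torch-saved dataset dict')
    arg_parser.add_argument('--rnn', action='store_true')
    args = arg_parser.parse_args()

    data = torch.load(args.data)
    loader, ids = get_data_loader(args, data["validation"], n=3)
    for prot_id, batch in zip(ids, loader):
        # model inference on `batch` would go here
        print(prot_id)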
def setup_data_loaders(batch_size=128, use_cuda=False):
    train_set = ProteinDataset()
    test_set = ProteinDataset()

    kwargs = {'num_workers': 1, 'pin_memory': use_cuda}
    train_loader = DataLoader(dataset=train_set,
                              batch_size=batch_size,
                              shuffle=True,
                              **kwargs)
    test_loader = DataLoader(dataset=test_set,
                             batch_size=batch_size,
                             shuffle=False,
                             **kwargs)
    return train_loader, test_loader
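# Hedged usage sketch for setup_data_loaders above (not part of the original
# code): the CUDA check and the empty batch loop are illustrative assumptions.
import torch

use_cuda = torch.cuda.is_available()
train_loader, test_loader = setup_data_loaders(batch_size=128, use_cuda=use_cuda)
for batch in train_loader:
    pass  # forward/backward pass on the model would go here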
from albumentations import HorizontalFlip, VerticalFlip, Rotate

if __name__ == '__main__':
    # params
    params = yaml.load(open('config.yaml'), Loader=yaml.FullLoader)

    # data transforms
    main_transforms = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(255 * params['mean'], 255 * params['std'])
    ])

    # defining test dataset
    test_dataset = ProteinDataset(img_csv_path=params['img_csv_path_test'],
                                  mode='test',
                                  data_path=params['data_path'],
                                  depth=3,
                                  img_size=512,
                                  transform=None)

    # defining dataloader
    test_loader = DataLoader(test_dataset,
                             shuffle=False,
                             batch_size=1,
                             pin_memory=True,
                             num_workers=4)

    # defining training components
    model = create_model(model_name=params['model_name'],
                         n_classes=params['n_classes'],
                         device=device,
                         multi_gpu=params['multi_gpu'],
cfg_dict = yaml.load(cfg_file, Loader=yaml.FullLoader)

MULTIGPU = cfg_dict.get('multigpu', True)
MAX_EPOCH = cfg_dict.get('max_epoch', 30)
CHECKPOINT_DIR = cfg_dict.get('checkpoint_dir', 'checkpoint')
GENERATOR = cfg_dict.get('generator', {})
DISCRIMINATOR = cfg_dict.get('discriminator', {})
LOSS = cfg_dict.get('loss', {})
GENERATOR_BATCH_SIZE = GENERATOR.get('batch_size', 1)
CLASS_NUMBER = GENERATOR.get('output_channel', 10)

# Load data & build dataset
TEST_DIR = os.path.join('data', 'test')
TEST_FEATURE_DIR = os.path.join(TEST_DIR, 'feature')
TEST_LABEL_DIR = os.path.join(TEST_DIR, 'label')
test_dataset = ProteinDataset(TEST_FEATURE_DIR, TEST_LABEL_DIR)
test_dataloader = DataLoader(test_dataset,
                             batch_size=GENERATOR_BATCH_SIZE,
                             shuffle=True,
                             collate_fn=collate_fn)

# Build model from configs
generator = ContactMapGenerator(GENERATOR)
discriminator = ContactMapDiscriminator(DISCRIMINATOR)

# Define criterion
LOSS_ALPHA = LOSS.get(
    'alpha', [0.25, 0.25, 0.25, 0.25, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75])
LOSS_BETA = LOSS.get('beta', 1.0)
LOSS_GAMMA = LOSS.get('gamma', 2.0)
LOSS_LAMBDA = LOSS.get('lambda', 1.0)
MULTIGPU = cfg_dict.get('multigpu', True)
MAX_EPOCH = cfg_dict.get('max_epoch', 30)
CHECKPOINT_DIR = cfg_dict.get('checkpoint_dir', 'checkpoint')
GENERATOR = cfg_dict.get('generator', {})
DISCRIMINATOR = cfg_dict.get('discriminator', {})
TRAINING_CFG = cfg_dict.get('training', {})
LOSS = cfg_dict.get('loss', {})
GENERATOR_BATCH_SIZE = GENERATOR.get('batch_size', 1)
CLASS_NUMBER = GENERATOR.get('output_channel', 10)

# Load data & build dataset
TRAIN_DIR = os.path.join('data', 'train')
TRAIN_FEATURE_DIR = os.path.join(TRAIN_DIR, 'feature')
TRAIN_LABEL_DIR = os.path.join(TRAIN_DIR, 'label')
train_dataset = ProteinDataset(TRAIN_FEATURE_DIR, TRAIN_LABEL_DIR)
train_dataloader = DataLoader(train_dataset,
                              batch_size=GENERATOR_BATCH_SIZE,
                              shuffle=True,
                              collate_fn=collate_fn)

VAL_DIR = os.path.join('data', 'val')
VAL_FEATURE_DIR = os.path.join(VAL_DIR, 'feature')
VAL_LABEL_DIR = os.path.join(VAL_DIR, 'label')
val_dataset = ProteinDataset(VAL_FEATURE_DIR, VAL_LABEL_DIR)
val_dataloader = DataLoader(val_dataset,
                            batch_size=GENERATOR_BATCH_SIZE,
                            shuffle=True,
                            collate_fn=collate_fn)

# Build model from configs
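# Hedged, illustrative sketch of consuming the loaders built above (not part of
# the original script). The per-batch unpacking is left abstract because the
# output format of collate_fn is not shown in this fragment.
for epoch in range(MAX_EPOCH):
    for batch in train_dataloader:
        pass  # generator forward pass, loss, and optimizer step would go here
    for batch in val_dataloader:
        pass  # validation metrics would be accumulated here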
        # If the residue is labelled as LIP, show it as a sphere
        if lip_indexes[i] == 1:
            cmd.show_as(
                "spheres",
                "chain {} and resi {}".format(residues[i].get_full_id()[2],
                                              residues[i].id[1]))
    # color by B-factor values
    cmd.spectrum("b", palette="rainbow", selection="(all)")


#######################################################
#################   MAIN PROGRAM   #####################
#######################################################

# parse the dataset
prot_dataset = ProteinDataset()
prot_dataset.parse()

# create the top-level parser
parser = argparse.ArgumentParser(prog='lips_predictor')
subparsers = parser.add_subparsers(help='sub-command help')

# create the parser for the "downpdb" command
parser_down_pdb = subparsers.add_parser('downpdb', help='Download pdb files.')
parser_down_pdb.add_argument(
    '-a',
    '--all',
    default=False,
    action='store_true',
    help='Download pdb file for every entry in the dataset.')
parser_down_pdb.add_argument('-i',
from datetime import datetime as dt

if __name__ == '__main__':
    # params
    params = yaml.load(open('config.yaml'), Loader=yaml.FullLoader)

    # data transforms
    main_transforms = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(255 * params['mean'], 255 * params['std'])
    ])

    # defining train/val datasets
    train_dataset = ProteinDataset(img_csv_path=params['img_csv_path_train'],
                                   mode='train',
                                   data_path=params['data_path'],
                                   depth=params['depth'],
                                   img_size=params['image_size'],
                                   transform=main_transforms)
    val_dataset = ProteinDataset(img_csv_path=params['img_csv_path_val'],
                                 data_path=params['data_path'],
                                 mode='val',
                                 depth=params['depth'],
                                 img_size=params['image_size'],
                                 transform=main_transforms)

    # defining dataloaders
    weights = pd.read_csv(params['img_csv_path_train']).Weight.values
    train_loader = DataLoader(train_dataset,
                              sampler=WeightedRandomSampler(
                                  weights=weights,
                                  num_samples=len(weights)),
data = torch.load(opt.data)
to_predict = np.random.choice(data[dataset]["ids"], 3)  # ["2NLP_D", "3ASK_Q", "1SZA_C"]
actual_order = []
seqs = []
angs = []
for i, prot in enumerate(data[dataset]["ids"]):
    if prot.upper() in to_predict:
        seqs.append(data[dataset]["seq"][i])
        angs.append(data[dataset]["ang"][i])
        actual_order.append(prot)
assert len(seqs) == 3 and len(angs) == 3

data_loader = torch.utils.data.DataLoader(
    ProteinDataset(
        seqs=seqs,
        angs=angs),
    num_workers=2,
    batch_size=1,
    collate_fn=paired_collate_fn,
    shuffle=False)

cords_list = []
losses = []
norm_losses = []

# TODO: make batch-level predictions?
with torch.no_grad():
    for batch in tqdm(data_loader,
                      mininterval=2,
                      desc=' - (Evaluation) ',
                      leave=False):
        # prepare data