def main(args):
    """Train an ERGO model (AE- or LSTM-based) on TCR-peptide pairs.

    Loads the raw dataset via ``ergo_data_loader``, builds batches for the
    selected model type, trains it, and optionally saves the held-out test
    split, the trained model weights, and the best ROC curve.

    ``args`` is an argparse-style namespace; only the attributes read below
    (model_type, dataset, sampling, protein, hla, device, *_file) are used.
    """
    # Word-to-index dictionaries. The LSTM vocabulary reserves index 0 for
    # a PAD token; the AE model pads peptides with 'PAD' but appends an 'X'
    # end-marker to TCRs (matching the pretrained autoencoder's encoding).
    amino_acids = list('ARNDCEQGHILKMFPSTWYV')
    if args.model_type == 'lstm':
        amino_to_ix = {amino: index for index, amino in enumerate(['PAD'] + amino_acids)}
    if args.model_type == 'ae':
        pep_atox = {amino: index for index, amino in enumerate(['PAD'] + amino_acids)}
        tcr_atox = {amino: index for index, amino in enumerate(amino_acids + ['X'])}

    # Run configuration ('ignore' disables per-epoch AUC logging).
    arg = {}
    arg['train_auc_file'] = args.train_auc_file if args.train_auc_file else 'ignore'
    arg['test_auc_file'] = args.test_auc_file if args.test_auc_file else 'ignore'
    if args.test_auc_file == 'auto':
        out_dir = 'memory_and_protein'  # renamed from 'dir' to avoid shadowing the builtin
        p_key = 'protein' if args.protein else ''
        arg['test_auc_file'] = out_dir + '/' + '_'.join(
            [args.model_type, args.dataset, args.sampling, p_key])
    arg['ae_file'] = args.ae_file
    if args.ae_file == 'auto':
        # Resolve 'auto' to the bundled pretrained autoencoder checkpoint.
        args.ae_file = 'TCR_Autoencoder/tcr_autoencoder.pt'
        arg['ae_file'] = args.ae_file
    arg['siamese'] = False

    # Training hyper-parameters.
    params = {
        'lr': 1e-3,
        'wd': 1e-5,
        'epochs': 200,
        'batch_size': 50,
        'lstm_dim': 30,
        'emb_dim': 10,
        'dropout': 0.1,
        'option': 0,
        'enc_dim': 30,
        'train_ae': True,
    }

    # Load autoencoder params: max TCR length and batch size must match
    # whatever the pretrained checkpoint was built with.
    if args.model_type == 'ae':
        # map_location keeps CPU-only machines working with GPU-saved checkpoints.
        checkpoint = torch.load(args.ae_file, map_location=args.device)
        params['max_len'] = checkpoint['max_len']
        params['batch_size'] = checkpoint['batch_size']

    # Resolve the raw data file for the selected dataset.
    if args.dataset == 'mcpas':
        datafile = 'data/McPAS-TCR.csv'
    elif args.dataset == 'vdjdb':
        datafile = 'data/VDJDB_complete.tsv'
    else:
        # The original fell through silently and crashed later with a
        # NameError on 'datafile'; fail fast with a clear message instead.
        raise ValueError('unknown dataset: ' + repr(args.dataset))
    train, test = ergo_data_loader.load_data(datafile, args.dataset, args.sampling,
                                             _protein=args.protein, _hla=args.hla)

    # Save the test split so later evaluation uses exactly the held-out pairs.
    if args.test_data_file == 'auto':
        out_dir = 'memory_and_protein'
        p_key = 'protein' if args.protein else ''
        args.test_data_file = out_dir + '/' + '_'.join(
            [args.model_type, args.dataset, args.sampling, p_key, 'test'])
    if args.test_data_file:
        with open(args.test_data_file + '.pickle', 'wb') as handle:
            pickle.dump(test, handle)

    if args.model_type == 'ae':
        # Build train/test batches and train the autoencoder-based model.
        train_tcrs, train_peps, train_signs = ae_get_lists_from_pairs(train, params['max_len'])
        train_batches = ae.get_batches(train_tcrs, train_peps, train_signs, tcr_atox,
                                       pep_atox, params['batch_size'], params['max_len'])
        test_tcrs, test_peps, test_signs = ae_get_lists_from_pairs(test, params['max_len'])
        test_batches = ae.get_batches(test_tcrs, test_peps, test_signs, tcr_atox,
                                      pep_atox, params['batch_size'], params['max_len'])
        model, best_auc, best_roc = ae.train_model(train_batches, test_batches,
                                                   args.device, arg, params)
    elif args.model_type == 'lstm':
        # Build train/test batches and train the LSTM-based model.
        # convert_data mutates the tcr/pep lists in place (indices, not strings).
        train_tcrs, train_peps, train_signs = lstm_get_lists_from_pairs(train)
        lstm.convert_data(train_tcrs, train_peps, amino_to_ix)
        train_batches = lstm.get_batches(train_tcrs, train_peps, train_signs,
                                         params['batch_size'])
        test_tcrs, test_peps, test_signs = lstm_get_lists_from_pairs(test)
        lstm.convert_data(test_tcrs, test_peps, amino_to_ix)
        test_batches = lstm.get_batches(test_tcrs, test_peps, test_signs,
                                        params['batch_size'])
        model, best_auc, best_roc = lstm.train_model(train_batches, test_batches,
                                                     args.device, arg, params)
    else:
        # The original would crash later with a NameError on 'model'.
        raise ValueError('unknown model type: ' + repr(args.model_type))

    # Save trained model weights.
    if args.model_file == 'auto':
        out_dir = 'memory_and_protein'
        p_key = 'protein' if args.protein else ''
        args.model_file = out_dir + '/' + '_'.join(
            [args.model_type, args.dataset, args.sampling, p_key, 'model.pt'])
    if args.model_file:
        torch.save({'model_state_dict': model.state_dict()}, args.model_file)
    if args.roc_file:
        # Save best ROC curve and AUC.
        np.savez(args.roc_file, fpr=best_roc[0], tpr=best_roc[1], auc=np.array(best_auc))
def main(args):
    """Train an ERGO model from pre-pickled train/test splits.

    NOTE(review): this re-definition shadows an earlier ``main`` in the same
    module; only this version is reachable after import — confirm intended.

    Unlike the earlier variant, this one does not call
    ``ergo_data_loader.load_data`` (that call is commented out); it expects
    ``args.train_data_file`` / ``args.test_data_file`` to point at pickles
    produced by a previous run.
    """
    # Word-to-index dictionaries (PAD token for LSTM/peptides; 'X' end
    # marker for AE-encoded TCRs).
    amino_acids = list('ARNDCEQGHILKMFPSTWYV')
    if args.model_type == 'lstm':
        amino_to_ix = {amino: index for index, amino in enumerate(['PAD'] + amino_acids)}
    if args.model_type == 'ae':
        pep_atox = {amino: index for index, amino in enumerate(['PAD'] + amino_acids)}
        tcr_atox = {amino: index for index, amino in enumerate(amino_acids + ['X'])}

    # Run configuration ('ignore' disables per-epoch AUC logging).
    arg = {}
    arg['train_auc_file'] = args.train_auc_file if args.train_auc_file else 'ignore'
    arg['test_auc_file'] = args.test_auc_file if args.test_auc_file else 'ignore'
    if args.test_auc_file == 'auto':
        out_dir = 'save_results'  # renamed from 'dir' to avoid shadowing the builtin
        p_key = 'protein' if args.protein else ''
        arg['test_auc_file'] = out_dir + '/' + '_'.join(
            [args.model_type, args.dataset, args.sampling, p_key])
    arg['ae_file'] = args.ae_file
    if args.ae_file == 'auto':
        # Default checkpoint; for model_type == 'ae' this is overridden
        # below by the enc_dim-specific path.
        args.ae_file = 'TCR_Autoencoder/tcr_ae_dim_30.pt'
        arg['ae_file'] = args.ae_file
    arg['siamese'] = False

    # Training hyper-parameters for this pickle-driven variant.
    params = {
        'lr': 1e-4,
        'wd': 0,
        'epochs': 25 if args.dataset == 'tumor' else 100,
        'batch_size': 50,
        'lstm_dim': 500,
        'emb_dim': 10,
        'dropout': 0.1,
        'option': 0,
        'enc_dim': 100,
        'train_ae': True,
    }

    # Load autoencoder params: the checkpoint is selected by enc_dim here,
    # unconditionally overriding any user-supplied args.ae_file.
    if args.model_type == 'ae':
        args.ae_file = 'TCR_Autoencoder/tcr_ae_dim_' + str(params['enc_dim']) + '.pt'
        arg['ae_file'] = args.ae_file
        # map_location keeps CPU-only machines working with GPU-saved checkpoints.
        checkpoint = torch.load(args.ae_file, map_location=args.device)
        params['max_len'] = checkpoint['max_len']
        params['batch_size'] = checkpoint['batch_size']

    # Resolve the raw data file for the selected dataset.
    # NOTE(review): currently unused — the load_data call below is commented
    # out and the splits are read from pickles instead.
    if args.dataset == 'mcpas':
        datafile = r'data/nine_class_testdata.csv'
    elif args.dataset == 'vdjdb':
        datafile = r'data/VDJDB_complete.tsv'
    elif args.dataset == 'united':
        datafile = {'mcpas': r'data/nine_class_testdata.csv',
                    'vdjdb': r'data/VDJDB_complete.tsv'}
    elif args.dataset == 'tumor':
        datafile = r'tumor/extended_cancer_pairs'
    elif args.dataset == 'nettcr':
        datafile = r'NetTCR/iedb_mira_pos_uniq'
    # train, test = ergo_data_loader.load_data(datafile, args.dataset, args.sampling,
    #                                          _protein=False, _hla=False)

    # Load the train split (the original comment said "Save train", but the
    # pickle is opened for reading).
    if args.train_data_file == 'auto':
        out_dir = 'save_results'
        p_key = 'protein' if args.protein else ''
        args.train_data_file = out_dir + '/' + '_'.join(
            [args.model_type, args.dataset, args.sampling, p_key, 'train'])
    if not args.train_data_file:
        # The original would crash later with a NameError on 'train'.
        raise ValueError('train_data_file is required: data is loaded from pickles')
    with open(args.train_data_file + '.pickle', 'rb') as handle:
        # Security: pickle.load runs arbitrary code — only load trusted files.
        train = pickle.load(handle)

    # Load the test split (same situation as the train split above).
    if args.test_data_file == 'auto':
        # NOTE(review): 'final_results' differs from 'save_results' used for
        # the train split above — confirm this asymmetry is intended.
        out_dir = 'final_results'
        p_key = 'protein' if args.protein else ''
        args.test_data_file = out_dir + '/' + '_'.join(
            [args.model_type, args.dataset, args.sampling, p_key, 'test'])
    if not args.test_data_file:
        raise ValueError('test_data_file is required: data is loaded from pickles')
    with open(args.test_data_file + '.pickle', 'rb') as handle2:
        test = pickle.load(handle2)

    if args.model_type == 'ae':
        # Build train/test batches and train the autoencoder-based model.
        train_tcrs, train_peps, train_signs = ae_get_lists_from_pairs(train, params['max_len'])
        train_batches = ae.get_batches(train_tcrs, train_peps, train_signs, tcr_atox,
                                       pep_atox, params['batch_size'], params['max_len'])
        test_tcrs, test_peps, test_signs = ae_get_lists_from_pairs(test, params['max_len'])
        test_batches = ae.get_batches(test_tcrs, test_peps, test_signs, tcr_atox,
                                      pep_atox, params['batch_size'], params['max_len'])
        model, best_auc, best_roc = ae.train_model(train_batches, test_batches,
                                                   args.device, arg, params)
    elif args.model_type == 'lstm':
        # Build train/test batches and train the LSTM-based model.
        # convert_data mutates the tcr/pep lists in place (indices, not strings).
        train_tcrs, train_peps, train_signs = lstm_get_lists_from_pairs(train)
        lstm.convert_data(train_tcrs, train_peps, amino_to_ix)
        train_batches = lstm.get_batches(train_tcrs, train_peps, train_signs,
                                         params['batch_size'])
        test_tcrs, test_peps, test_signs = lstm_get_lists_from_pairs(test)
        lstm.convert_data(test_tcrs, test_peps, amino_to_ix)
        test_batches = lstm.get_batches(test_tcrs, test_peps, test_signs,
                                        params['batch_size'])
        model, best_auc, best_roc = lstm.train_model(train_batches, test_batches,
                                                     args.device, arg, params)
    else:
        # The original would crash later with a NameError on 'model'.
        raise ValueError('unknown model type: ' + repr(args.model_type))

    # Save trained model weights together with the hyper-parameters used.
    if args.model_file == 'auto':
        out_dir = 'final_results'
        p_key = 'protein' if args.protein else ''
        args.model_file = out_dir + '/' + '_'.join(
            [args.model_type, args.dataset, args.sampling, p_key, 'model.pt'])
    if args.model_file:
        torch.save({'model_state_dict': model.state_dict(),
                    'params': params}, args.model_file)
    if args.roc_file:
        # Save best ROC curve and AUC.
        np.savez(args.roc_file, fpr=best_roc[0], tpr=best_roc[1], auc=np.array(best_auc))