# Example #1
# 0
def main(args):
    """Train a TCR-peptide binding model (LSTM- or autoencoder-based).

    Builds amino-acid vocabularies, loads the train/test split from the
    raw dataset file, trains the model selected by ``args.model_type``
    ('lstm' or 'ae'), and optionally saves the test split, the trained
    weights, and the best ROC curve/AUC.
    """
    # Word-to-index vocabularies. 'PAD' is a padding token; 'X' is an
    # extra TCR symbol (presumably an end/stop marker -- confirm against
    # the ae encoder).
    amino_acids = list('ARNDCEQGHILKMFPSTWYV')
    if args.model_type == 'lstm':
        amino_to_ix = {amino: index
                       for index, amino in enumerate(['PAD'] + amino_acids)}
    elif args.model_type == 'ae':
        pep_atox = {amino: index
                    for index, amino in enumerate(['PAD'] + amino_acids)}
        tcr_atox = {amino: index
                    for index, amino in enumerate(amino_acids + ['X'])}

    # Runtime options passed to the trainers ('ignore' disables AUC dumps).
    arg = {}
    arg['train_auc_file'] = args.train_auc_file if args.train_auc_file else 'ignore'
    arg['test_auc_file'] = args.test_auc_file if args.test_auc_file else 'ignore'
    if args.test_auc_file == 'auto':
        out_dir = 'memory_and_protein'  # renamed from 'dir' (shadowed builtin)
        p_key = 'protein' if args.protein else ''
        arg['test_auc_file'] = out_dir + '/' + '_'.join(
            [args.model_type, args.dataset, args.sampling, p_key])
    arg['ae_file'] = args.ae_file
    if args.ae_file == 'auto':
        args.ae_file = 'TCR_Autoencoder/tcr_autoencoder.pt'
        arg['ae_file'] = 'TCR_Autoencoder/tcr_autoencoder.pt'
    arg['siamese'] = False

    # Hyper-parameters.
    params = {
        'lr': 1e-3,
        'wd': 1e-5,
        'epochs': 200,
        'batch_size': 50,
        'lstm_dim': 30,
        'emb_dim': 10,
        'dropout': 0.1,
        'option': 0,
        'enc_dim': 30,
        'train_ae': True,
    }

    # max_len / batch_size must match the pretrained autoencoder checkpoint.
    if args.model_type == 'ae':
        checkpoint = torch.load(args.ae_file)
        params['max_len'] = checkpoint['max_len']
        params['batch_size'] = checkpoint['batch_size']

    # Resolve the raw dataset file.
    if args.dataset == 'mcpas':
        datafile = 'data/McPAS-TCR.csv'
    elif args.dataset == 'vdjdb':
        datafile = 'data/VDJDB_complete.tsv'
    else:
        # Previously an unknown dataset fell through to a NameError on
        # 'datafile'; fail early with a clear message instead.
        raise ValueError('unknown dataset: ' + str(args.dataset))
    train, test = ergo_data_loader.load_data(datafile,
                                             args.dataset,
                                             args.sampling,
                                             _protein=args.protein,
                                             _hla=args.hla)

    # Persist the test split so later evaluation reuses the exact split.
    if args.test_data_file == 'auto':
        out_dir = 'memory_and_protein'
        p_key = 'protein' if args.protein else ''
        args.test_data_file = out_dir + '/' + '_'.join(
            [args.model_type, args.dataset, args.sampling, p_key, 'test'])
    if args.test_data_file:
        with open(args.test_data_file + '.pickle', 'wb') as handle:
            pickle.dump(test, handle)

    if args.model_type == 'ae':
        # Convert pairs to padded index batches and train the AE model.
        train_tcrs, train_peps, train_signs = ae_get_lists_from_pairs(
            train, params['max_len'])
        train_batches = ae.get_batches(train_tcrs, train_peps, train_signs,
                                       tcr_atox, pep_atox,
                                       params['batch_size'], params['max_len'])
        test_tcrs, test_peps, test_signs = ae_get_lists_from_pairs(
            test, params['max_len'])
        test_batches = ae.get_batches(test_tcrs, test_peps, test_signs,
                                      tcr_atox, pep_atox, params['batch_size'],
                                      params['max_len'])
        model, best_auc, best_roc = ae.train_model(train_batches, test_batches,
                                                   args.device, arg, params)
    elif args.model_type == 'lstm':
        # Convert pairs to index batches and train the LSTM model.
        train_tcrs, train_peps, train_signs = lstm_get_lists_from_pairs(train)
        lstm.convert_data(train_tcrs, train_peps, amino_to_ix)
        train_batches = lstm.get_batches(train_tcrs, train_peps, train_signs,
                                         params['batch_size'])
        test_tcrs, test_peps, test_signs = lstm_get_lists_from_pairs(test)
        lstm.convert_data(test_tcrs, test_peps, amino_to_ix)
        test_batches = lstm.get_batches(test_tcrs, test_peps, test_signs,
                                        params['batch_size'])
        model, best_auc, best_roc = lstm.train_model(train_batches,
                                                     test_batches, args.device,
                                                     arg, params)

    # Save trained model. NOTE(review): 'model' is only bound when
    # model_type is 'ae' or 'lstm' -- argparse is assumed to restrict the
    # choices; verify against the CLI definition.
    if args.model_file == 'auto':
        out_dir = 'memory_and_protein'
        p_key = 'protein' if args.protein else ''
        args.model_file = out_dir + '/' + '_'.join(
            [args.model_type, args.dataset, args.sampling, p_key, 'model.pt'])
    if args.model_file:
        torch.save({
            'model_state_dict': model.state_dict(),
        }, args.model_file)
    if args.roc_file:
        # Save best ROC curve and AUC.
        np.savez(args.roc_file,
                 fpr=best_roc[0],
                 tpr=best_roc[1],
                 auc=np.array(best_auc))
# Example #2
# 0
def main(args):
    """Train a TCR-peptide binding model from cached train/test pickles.

    Variant of the training driver that loads pre-built train/test splits
    from pickle files (the raw ``ergo_data_loader`` call was retired),
    trains the model selected by ``args.model_type`` ('lstm' or 'ae'),
    and saves the weights, hyper-parameters, and best ROC curve/AUC.
    """
    # Word-to-index vocabularies. 'PAD' is a padding token; 'X' is an
    # extra TCR symbol (presumably an end/stop marker -- confirm against
    # the ae encoder).
    amino_acids = list('ARNDCEQGHILKMFPSTWYV')
    if args.model_type == 'lstm':
        amino_to_ix = {amino: index for index, amino in enumerate(['PAD'] + amino_acids)}
    elif args.model_type == 'ae':
        pep_atox = {amino: index for index, amino in enumerate(['PAD'] + amino_acids)}
        tcr_atox = {amino: index for index, amino in enumerate(amino_acids + ['X'])}

    # Runtime options passed to the trainers ('ignore' disables AUC dumps).
    arg = {}
    arg['train_auc_file'] = args.train_auc_file if args.train_auc_file else 'ignore'
    arg['test_auc_file'] = args.test_auc_file if args.test_auc_file else 'ignore'
    if args.test_auc_file == 'auto':
        out_dir = 'save_results'  # renamed from 'dir' (shadowed builtin)
        p_key = 'protein' if args.protein else ''
        arg['test_auc_file'] = out_dir + '/' + '_'.join([args.model_type, args.dataset, args.sampling, p_key])
    arg['ae_file'] = args.ae_file
    if args.ae_file == 'auto':
        args.ae_file = 'TCR_Autoencoder/tcr_ae_dim_30.pt'
        arg['ae_file'] = 'TCR_Autoencoder/tcr_ae_dim_30.pt'
    arg['siamese'] = False

    # Hyper-parameters; tumor data gets fewer epochs.
    params = {
        'lr': 1e-4,
        'wd': 0,
        'epochs': 100,
        'batch_size': 50,
        'lstm_dim': 500,
        'emb_dim': 10,
        'dropout': 0.1,
        'option': 0,
        'enc_dim': 100,
        'train_ae': True,
    }
    if args.dataset == 'tumor':
        params['epochs'] = 25

    # Load autoencoder params. The checkpoint path is derived from
    # enc_dim and overrides any user-supplied ae_file; max_len and
    # batch_size must match the pretrained checkpoint.
    if args.model_type == 'ae':
        args.ae_file = 'TCR_Autoencoder/tcr_ae_dim_' + str(params['enc_dim']) + '.pt'
        arg['ae_file'] = args.ae_file
        checkpoint = torch.load(args.ae_file, map_location=args.device)
        params['max_len'] = checkpoint['max_len']
        params['batch_size'] = checkpoint['batch_size']

    # Dataset file paths. NOTE(review): currently unused -- the raw-data
    # loader call was removed in favour of the cached pickles below; the
    # mapping is kept so dataset -> file stays documented.
    if args.dataset == 'mcpas':
        datafile = r'data/nine_class_testdata.csv'
    elif args.dataset == 'vdjdb':
        datafile = r'data/VDJDB_complete.tsv'
    elif args.dataset == 'united':
        datafile = {'mcpas': r'data/nine_class_testdata.csv', 'vdjdb': r'data/VDJDB_complete.tsv'}
    elif args.dataset == 'tumor':
        datafile = r'tumor/extended_cancer_pairs'
    elif args.dataset == 'nettcr':
        datafile = r'NetTCR/iedb_mira_pos_uniq'

    # Load cached train split. (The original comment said "Save train",
    # but the pickle is opened for reading.) NOTE(review): if
    # args.train_data_file is falsy, 'train' stays unbound and training
    # fails with a NameError -- confirm the CLI always supplies it.
    # Unpickling executes arbitrary code; only load trusted files.
    if args.train_data_file == 'auto':
        out_dir = 'save_results'
        p_key = 'protein' if args.protein else ''
        args.train_data_file = out_dir + '/' + '_'.join([args.model_type, args.dataset, args.sampling, p_key, 'train'])
    if args.train_data_file:
        with open(args.train_data_file + '.pickle', 'rb') as handle:
            train = pickle.load(handle)

    # Load cached test split (same caveats as the train split above).
    if args.test_data_file == 'auto':
        out_dir = 'final_results'
        p_key = 'protein' if args.protein else ''
        args.test_data_file = out_dir + '/' + '_'.join([args.model_type, args.dataset, args.sampling, p_key, 'test'])
    if args.test_data_file:
        with open(args.test_data_file + '.pickle', 'rb') as handle:
            test = pickle.load(handle)

    if args.model_type == 'ae':
        # Convert pairs to padded index batches and train the AE model.
        train_tcrs, train_peps, train_signs = ae_get_lists_from_pairs(train, params['max_len'])
        train_batches = ae.get_batches(train_tcrs, train_peps, train_signs, tcr_atox, pep_atox, params['batch_size'], params['max_len'])
        test_tcrs, test_peps, test_signs = ae_get_lists_from_pairs(test, params['max_len'])
        test_batches = ae.get_batches(test_tcrs, test_peps, test_signs, tcr_atox, pep_atox, params['batch_size'], params['max_len'])
        model, best_auc, best_roc = ae.train_model(train_batches, test_batches, args.device, arg, params)
    elif args.model_type == 'lstm':
        # Convert pairs to index batches and train the LSTM model.
        train_tcrs, train_peps, train_signs = lstm_get_lists_from_pairs(train)
        lstm.convert_data(train_tcrs, train_peps, amino_to_ix)
        train_batches = lstm.get_batches(train_tcrs, train_peps, train_signs, params['batch_size'])
        test_tcrs, test_peps, test_signs = lstm_get_lists_from_pairs(test)
        lstm.convert_data(test_tcrs, test_peps, amino_to_ix)
        test_batches = lstm.get_batches(test_tcrs, test_peps, test_signs, params['batch_size'])
        model, best_auc, best_roc = lstm.train_model(train_batches, test_batches, args.device, arg, params)

    # Save trained model (state dict plus the hyper-parameters used).
    # NOTE(review): 'model' is only bound when model_type is 'ae' or
    # 'lstm' -- argparse is assumed to restrict the choices.
    if args.model_file == 'auto':
        out_dir = 'final_results'
        p_key = 'protein' if args.protein else ''
        args.model_file = out_dir + '/' + '_'.join([args.model_type, args.dataset, args.sampling, p_key, 'model.pt'])
    if args.model_file:
        torch.save({
                    'model_state_dict': model.state_dict(),
                    'params': params
                    }, args.model_file)
    if args.roc_file:
        # Save best ROC curve and AUC.
        np.savez(args.roc_file, fpr=best_roc[0], tpr=best_roc[1], auc=np.array(best_auc))