def main():
    """Score the network on the IEDB T-cell data for each epoch in a range.

    Command-line arguments read from ``parser``:
      * ``args.pred``  -- passed to ``get_graph_from_hyperparameters`` and used
        as a directory name in the weights path.
      * ``args.epoch`` -- ``"start,stop"``; epochs ``start`` .. ``stop - 1``
        are evaluated.

    For every epoch the saved weights are loaded, each peptide of length 8 to
    11 belonging to an allele whose name does not start with ``'C'`` is
    predicted, and one line ``<epoch> <score,score,...>`` is printed, where
    the scores are all but the first element returned by ``scores``.
    """
    args = parser.parse_args()

    graph = get_graph_from_hyperparameters(args.pred)
    batch_size = 32

    # list() keeps the indexing below valid where map() returns an iterator.
    epoch_range = list(map(int, args.epoch.split(',')))

    # None of the inputs below depends on the epoch, so they are read once
    # instead of once per loaded set of weights.
    allele_sequence_data, _ = load_allele_sequence_data(
        HOME_PATH + '/pan_allele/files/pseudo/pseudo_sequences.fasta')
    predictions = read_tcell_predictions(
        HOME_PATH + '/pan_allele/files/iedb-tcell-2009-negative.csv',
        HOME_PATH + '/pan_allele/files/iedb-tcell-2009-positive.csv')
    allele_list = [allele for allele in sorted(predictions.keys())
                   if not allele.startswith('C')]

    for epoch in range(epoch_range[0], epoch_range[1]):
        # NOTE(review): max_ic50 is not assigned in this function; presumably
        # a module-level name -- confirm.
        graph.load_weights(HOME_PATH + '/weights' + str(max_ic50) + '/'
                           + args.pred + '/weights' + str(batch_size)
                           + '_' + str(epoch))

        Y_true = []
        Y_pred = []
        for allele in allele_list:
            for peptide, label in predictions[allele].items():
                # only peptides of length 8, 9, 10 or 11 are scored
                if 7 < len(peptide) < 12:
                    Y_true.append(label)
                    Y_pred.append(make_prediction(peptide,
                                                  allele_sequence_data[allele],
                                                  graph))

        score = scores(Y_true, Y_pred)
        # the first element of the scores is not reported
        print('%s %s' % (epoch, ','.join(map(str, score[1:]))))
def main():
    """Build the network selected by ``args.pred`` and hand it to ``save_model``.

    ``save_model`` is called once for every (learning rate, batch size)
    combination in the two hard-coded lists below, each time with
    ``args.batch_size``, ``args.epochs`` and ``max_ic50=args.max_ic50``.
    """
    args = parser.parse_args()
    graph = get_graph_from_hyperparameters(args.pred)

    learning_rates = [0.001]
    batch_sizes = [32]

    # NOTE(review): the values being iterated are not forwarded to save_model;
    # only the number of combinations affects how often it is called.
    combinations = [(lr, batch_size)
                    for lr in learning_rates
                    for batch_size in batch_sizes]
    for _combination in combinations:
        save_model(graph, args.pred, args.batch_size, args.epochs,
                   max_ic50=args.max_ic50)
# Example no. 3
0
def main():
    """Train an ensemble of networks and score its mean prediction on blind data.

    Command-line arguments read from ``parser``: ``args.max_ic50`` and
    ``args.pred`` (passed to ``get_graph_from_hyperparameters``).

    ``nb_iter`` networks are trained, each on the training part of a fresh
    ``get_model_data`` draw.  For every blind allele the members' predictions
    on the peptides of ``combined-test-data/<allele>.csv`` are averaged,
    converted with ``max_ic50 ** (1 - mean_prediction)`` and passed to
    ``scores`` together with the measured values.  The per-allele scores are
    averaged over all blind alleles and printed.
    """
    args = parser.parse_args()
    max_ic50 = args.max_ic50

    #IEDB data
    allele_groups, df = load_binding_data(BINDING_DATA_PATH, max_ic50=max_ic50, peptide_length=9)

    #graph initialized here so that pseudo sequences are made accordingly
    graph = get_graph_from_hyperparameters(args.pred)

    #allele sequence data
    allele_sequence_data, max_allele_length = load_allele_sequence_data(SEQUENCE_DATA_PATH)
    allele_list = sorted(create_allele_list(allele_groups, allele_sequence_data))

    #reading blind data from txt file that contains aggregated data for all alleles
    blind_allele_groups, blind_df = load_binding_data('blind_data.txt', max_ic50=max_ic50, peptide_length=None)
    blind_allele_list = sorted(create_allele_list(blind_allele_groups, allele_sequence_data))

    nb_iter = 50 #number of networks to include in the ensemble

    # The blind files are the same for every ensemble member, so they are read
    # once.  Fixing the peptide order here also guarantees that the members'
    # prediction vectors line up element by element when they are summed.
    blind_peptides = {}
    actual_allele = {}
    for allele in blind_allele_list:
        predictions = read_blind_predictions(HOME_PATH + '/combined-test-data/' + allele + '.csv')
        blind_peptides[allele] = list(predictions.keys())
        actual_allele[allele] = np.array([predictions[peptide]['meas']
                                          for peptide in blind_peptides[allele]])

    # Plain dict on purpose: with a list default, ``+=`` would *extend* the
    # list with the array's elements instead of adding the arrays.
    preds_allele = {}

    for i in range(0, nb_iter):

        #get_model_data shuffles the data so theres no need for further shuffling
        peptides, mhc, Y = get_model_data(allele_list,
                                          allele_sequence_data,
                                          allele_groups,
                                          peptide_length=9,
                                          mhc_length=max_allele_length)

        #splitting peptides, mhcs and binding into training and test
        peptides_train, peptides_test = split_train_test(peptides, 5)
        mhc_train, mhc_test = split_train_test(mhc, 5)
        Y_train, Y_test = split_train_test(Y, 5)

        #fit a fresh graph model for this ensemble member
        graph = get_graph_from_hyperparameters(args.pred)
        graph.fit({'peptide': peptides_train, 'mhc': mhc_train, 'output': Y_train},
                  batch_size=32,
                  nb_epoch=12,
                  verbose=0,
                  )

        #accumulate this member's share of the ensemble mean for each allele
        for allele in blind_allele_list:
            print('%s %s' % (i, allele))

            preds = np.array([make_prediction(peptide, allele_sequence_data[allele], graph)
                              for peptide in blind_peptides[allele]])
            share = preds / float(nb_iter)

            if allele in preds_allele:
                preds_allele[allele] += share
            else:
                preds_allele[allele] = share

    #calculate average for all the alleles
    calculated_metrics = np.zeros(6)

    for allele in blind_allele_list:
        Y_pred_allele = max_ic50**(1 - preds_allele[allele])
        Y_true_allele = actual_allele[allele]
        score_allele = scores(Y_true_allele, Y_pred_allele)
        calculated_metrics += score_allele ##sum metrics for all alleles

    print(calculated_metrics / len(blind_allele_list)) #divide sum by number of alleles
def main():
    """Compare the network with reference predictors on the blind test set.

    Command-line arguments read from ``parser``: ``args.pred``,
    ``args.epoch`` (``"start,stop"``, stop excluded), ``args.max_ic50`` and
    ``args.allele_info``.

    For every epoch the saved weights are loaded.  For each allele the
    network output is converted with ``max_ic50 ** (1 - prediction)`` and
    stored under ``'mhcflurry'`` next to the values read from
    ``combined-test-data/<allele>.csv``.  ``scores`` is evaluated per allele
    and predictor against the ``'meas'`` row, averaged over the alleles, and
    printed as ``<predictor> , <score,score,...>`` with the first score
    dropped.  When ``args.allele_info`` is set, the per-allele scores are
    printed too.
    """
    #prediction input either "conv", "ffn_concat", "ffn_mult"
    args = parser.parse_args()
    # Rescale predictions with the same max_ic50 that selects the weights
    # directory below, so both always agree.
    max_ic50 = args.max_ic50
    # Kept as an equality test: the option's type is not visible here.
    show_allele_info = args.allele_info == True

    graph = get_graph_from_hyperparameters(args.pred)
    allele_sequence_data, max_allele_length = load_allele_sequence_data(SEQUENCE_DATA_PATH)

    predictors = ['mhcflurry', 'netmhcpan', 'netmhc', 'smmpmbec_cpp']

    allele_list = ['A0101', 'A0201', 'A0202', 'A0203', 'A0206', 'A0301',
                   'A1101', 'A2301', 'A2402', 'A2501', 'A2601', 'A2602',
                   'A2603', 'A2902', 'A3001', 'A3002', 'A3101', 'A3201',
                   'A3301', 'A6801', 'A6802', 'A6901', 'A8001', 'B0702',
                   'B0801', 'B0802', 'B0803', 'B1501', 'B1503', 'B1509',
                   'B1517', 'B1801', 'B2703', 'B2705', 'B3501', 'B3801',
                   'B3901', 'B4001', 'B4002', 'B4402', 'B4403', 'B4501',
                   'B4601', 'B5101', 'B5301', 'B5401', 'B5701', 'B5801']

    batch_size = 32
    # list() keeps the indexing below valid where map() returns an iterator.
    epoch_range = list(map(int, args.epoch.split(',')))

    for epoch in range(epoch_range[0], epoch_range[1]):

        graph.load_weights(HOME_PATH + '/weights' + str(max_ic50) + '/'
                           + args.pred + '/weights' + str(batch_size)
                           + '_' + str(epoch))

        # running sum of the per-allele scores of each predictor
        calculated_metrics = dict((val, np.zeros(6)) for val in predictors)

        #calculating metrics per allele
        for allele in allele_list:

            predictions = read_blind_predictions(HOME_PATH + '/combined-test-data/' + allele + '.csv')

            # add the network's own prediction next to the stored predictors
            for peptide in predictions:
                predictions[peptide]['mhcflurry'] = max_ic50**(
                    1 - make_prediction(peptide, allele_sequence_data[allele], graph))

            # peptides become columns; 'meas' and the predictor names are rows
            df_pred = pd.DataFrame(predictions)
            Y_true_allele = np.array(df_pred.loc['meas'])

            if show_allele_info:
                # NOTE(review): ic50_cutoff is not assigned in this function;
                # presumably a module-level name -- confirm.
                print("\n===== %s %s %s ====" % (allele,
                                                  sum(Y_true_allele <= ic50_cutoff),
                                                  len(Y_true_allele)))

            for val in predictors:
                Y_pred_allele = np.array(df_pred.loc[val])
                allele_scores = scores(Y_true_allele, Y_pred_allele)
                calculated_metrics[val] += allele_scores

                if show_allele_info:
                    print('%s %s' % (val, allele_scores))

        # Report: a blank line, then "<epoch> <first predictor line>" followed
        # by one line per remaining predictor.
        lines = []
        for val in predictors:
            calculated_metrics[val] = calculated_metrics[val] / len(allele_list)
            lines.append('%s , %s' % (val, ','.join(map(str, calculated_metrics[val][1:]))))
        print('\n%s %s' % (epoch, '\n'.join(lines)))

# Script section: for each saved epoch of the 'ffn_mult' network, predict every
# (allele, peptide) pair and print how many transformed predictions are < 500.
# NOTE(review): `peptides` and `nb_peptides` are used below but are not defined
# in this part of the file -- they must come from code above this section.
allele_sequence_data, max_allele_length = load_allele_sequence_data(SEQUENCE_DATA_PATH)


# Alleles to evaluate.
allele_list = ['A0101',	    'A0201',	'A0202',    'A0203',	'A0206',	'A0301',
               'A1101',	    'A2301',	'A2402',	'A2501',	'A2601',    'A2602',
               'A2603',	    'A2902',	'A3001',	'A3002',	'A3101',	'A3201',
               'A3301',	    'A6801',	'A6802',	'A6901',    'A8001',	'B0702',
               'B0801',	    'B0802',	'B0803',	'B1501',	'B1503',    'B1509',
               'B1517',	    'B1801',	'B2703',    'B2705',    'B3501',	'B3801',
               'B3901',	    'B4001',	'B4002',	'B4402',	'B4403',	'B4501',
               'B4601',	    'B5101',    'B5301',	'B5401',	'B5701',	'B5801'	]


graph = get_graph_from_hyperparameters('ffn_mult')
# Epochs 1 through 98 are evaluated.
for epoch in range(1,99):
    # NOTE(review): no '/' is inserted between HOME_PATH and 'weights' here --
    # confirm HOME_PATH ends with a separator.
    graph.load_weights(HOME_PATH + 'weights/ffn_mult/weights32_' + str(epoch))
    # one slot per (allele, peptide) pair, filled in loop order via `counter`
    predictions = np.empty(len(allele_list)*nb_peptides)
    counter = 0
    for allele in allele_list:
        # Both names are bound to the same result; only mhc_seq is used below.
        allele_sequence = mhc_seq = padded_indices([allele_sequence_data[allele]],
                                        add_start_symbol=False,
                                        add_end_symbol=False,
                                        index_dict=amino_acid_letter_indices)

        for peptide in peptides:
            # assumes the 'output' entry holds a single value for this one-peptide input -- TODO confirm
            predictions[counter] = 20000**(1-graph.predict({'peptide':[[peptide]],'mhc':mhc_seq})['output'])
            counter = counter + 1

    # number of transformed predictions below 500 for this epoch
    print epoch, np.sum(predictions<500)