            scores = defaultdict(list)

            # loop over repetitions
            for q in range(conf.num_reps):
                training_list = cPickle.load(training_input)
                test_list = [i for i in range(num_actives) if i not in training_list[:num_query_mols]]
                test_list += [i for i in range(num_decoys) if i not in training_list[num_query_mols:]]
                # loop over fps
                single_score = defaultdict(list)
                for fp in fp_names:
                    query_fps = [actives[i][1][fp] for i in training_list[:num_query_mols]]
                    # test_list: first actives then decoys
                    test_fps = [[actives[i][0], actives[i][1][fp], 1] for i in test_list[:num_test_actives]]
                    test_fps += [[decoys[i][0], decoys[i][1][fp], 0] for i in test_list[num_test_actives:]]
                    for tmp_mol in test_fps:
                        tmp_score = scor.getBulkSimilarity(tmp_mol[1], query_fps, simil_metric)
                        tmp_score.sort(reverse=True)
                        # use max fusion
                        # store : [similarity, internal ID, active/inactive]
                        single_score[fp].append([tmp_score[0], tmp_mol[0], tmp_mol[2]])
                    # rank list according to similarity
                    scores[fp].append(sorted(single_score[fp], reverse=True))

            # write scores to file
            if do_append:
                outfile = gzip.open(outpath+'/list_'+dataset+'_'+str(target)+'.pkl.gz', 'ab+') # binary format
            else:
                outfile = gzip.open(outpath+'/list_'+dataset+'_'+str(target)+'.pkl.gz', 'wb+') # binary format
            for fp in fp_names:
                cPickle.dump([fp, scores[fp]], outfile, 2)
            outfile.close()
            print "scoring done and scored lists written"
Example #2
                # test fps and molecule info
                test_fps = [
                    actives[i][1] for i in test_list[:num_test_actives]
                ]
                test_fps += [
                    decoys[i][1] for i in test_list[num_test_actives:]
                ]
                np_test_fps = [
                    np_fps_act[i] for i in test_list[:num_test_actives]
                ]
                np_test_fps += [
                    np_fps_dcy[i] for i in test_list[num_test_actives:]
                ]
                test_mols = [[actives[i][0], 1]
                             for i in test_list[:num_test_actives]]
                test_mols += [[decoys[i][0], 0]
                              for i in test_list[num_test_actives:]]

                # calculate similarity with standard fp
                std_simil = []
                for fp in test_fps:
                    tmp_simil = scor.getBulkSimilarity(fp, train_fps,
                                                       simil_metric)
                    tmp_simil.sort(reverse=True)
                    std_simil.append(tmp_simil[0])

                # rank based on probability (and second based on similarity)
                single_score = ml.predict_proba(np_test_fps)
                # store: [probability, similarity, internal ID, active/inactive]
                single_score = [[
                    m[1], s, t[0], t[1]
                ] for m, s, t in zip(single_score, std_simil, test_mols)]
                single_score.sort(reverse=True)
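                # note: assuming ml is a scikit-learn style classifier, predict_proba
                # returns class probabilities ordered by ml.classes_ ([0, 1] here), so
                # m[1] above is the probability of the active class; the lexicographic
                # sort then ranks by probability first and breaks ties by similarity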
                scores['rf_' + fp_build].append(single_score)

            # write scores to file
            if do_append:
                outfile = gzip.open(outpath + '/list_' + dataset + '_' +
                                    str(target) + '.pkl.gz',
                                    'ab+')  # binary format
            else:
                outfile = gzip.open(outpath + '/list_' + dataset + '_' +
                                    str(target) + '.pkl.gz',
                                    'wb+')  # binary format

Example #3

                test_list = [
                    i for i in range(num_actives)
                    if i not in training_list[:num_query_mols]
                ]
                test_list += [
                    i for i in range(num_decoys)
                    if i not in training_list[num_query_mols:]
                ]
                # loop over fps
                single_score = defaultdict(list)
                for fp in fp_names:
                    query_fps = [
                        actives[i][1][fp]
                        for i in training_list[:num_query_mols]
                    ]
                    # test_list: first actives then decoys
                    test_fps = [[actives[i][0], actives[i][1][fp], 1]
                                for i in test_list[:num_test_actives]]
                    test_fps += [[decoys[i][0], decoys[i][1][fp], 0]
                                 for i in test_list[num_test_actives:]]
                    for tmp_mol in test_fps:
                        tmp_score = scor.getBulkSimilarity(
                            tmp_mol[1], query_fps, simil_metric)
                        tmp_score.sort(reverse=True)
                        # use max fusion
                        # store : [similarity, internal ID, active/inactive]
                        single_score[fp].append(
                            [tmp_score[0], tmp_mol[0], tmp_mol[2]])
                    # rank list according to similarity
                    scores[fp].append(sorted(single_score[fp], reverse=True))

            # write scores to file
            if do_append:
                outfile = gzip.open(outpath + '/list_' + dataset + '_' +
                                    str(target) + '.pkl.gz',
                                    'ab+')  # binary format
            else:
                outfile = gzip.open(outpath + '/list_' + dataset + '_' +
                                    str(target) + '.pkl.gz',
                                    'wb+')  # binary format

Example #4

                np_train_fps = [np_fps_act[i] for i in training_list[:num_query_mols]]
                np_train_fps += [np_fps_dcy[i] for i in training_list[num_query_mols:]]
                # fit random forest
                ml.fit(np_train_fps, ys_fit)

                # test fps and molecule info
                test_fps = [actives[i][1] for i in test_list[:num_test_actives]]
                test_fps += [decoys[i][1] for i in test_list[num_test_actives:]]
                np_test_fps = [np_fps_act[i] for i in test_list[:num_test_actives]]
                np_test_fps += [np_fps_dcy[i] for i in test_list[num_test_actives:]]
                test_mols = [[actives[i][0], 1] for i in test_list[:num_test_actives]]
                test_mols += [[decoys[i][0], 0] for i in test_list[num_test_actives:]]

                # calculate similarity with standard fp
                std_simil = []
                for fp in test_fps:
                    tmp_simil = scor.getBulkSimilarity(fp, train_fps, simil_metric)
                    tmp_simil.sort(reverse=True)
                    std_simil.append(tmp_simil[0])

                # rank based on probability (and second based on similarity)
                single_score = ml.predict_proba(np_test_fps)
                # store: [probability, similarity, internal ID, active/inactive]
                single_score = [[m[1], s, t[0], t[1]] for m,s,t in zip(single_score,std_simil,test_mols)]
                single_score.sort(reverse=True)
                scores['rf_'+fp_build].append(single_score)

            # write scores to file
            if do_append:
                outfile = gzip.open(outpath+'/list_'+dataset+'_'+str(target)+'.pkl.gz', 'ab+') # binary format
            else:
                outfile = gzip.open(outpath+'/list_'+dataset+'_'+str(target)+'.pkl.gz', 'wb+') # binary format
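
The RF snippets above assume numpy versions of the fingerprints (np_fps_act, np_fps_dcy) and an unfitted classifier ml, none of which appear in these excerpts. Below is a minimal sketch of how they could be prepared, assuming RDKit bit-vector fingerprints and scikit-learn's RandomForestClassifier; the to_numpy_fps helper and the parameter values are illustrative, not taken from the original script.

import numpy
from rdkit import DataStructs
from sklearn.ensemble import RandomForestClassifier

def to_numpy_fps(fps):
    # convert RDKit bit vectors into numpy arrays usable by scikit-learn
    np_fps = []
    for fp in fps:
        arr = numpy.zeros((1,))
        DataStructs.ConvertToNumpyArray(fp, arr)
        np_fps.append(arr)
    return np_fps

# assuming the fingerprint sits at index 1 of each record, as in the test_fps lines above
np_fps_act = to_numpy_fps([a[1] for a in actives])
np_fps_dcy = to_numpy_fps([d[1] for d in decoys])
ml = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=23)

The written lists can later be read back by unpickling from the gzip file until EOFError, e.g.:

infile = gzip.open(outpath + '/list_' + dataset + '_' + str(target) + '.pkl.gz', 'rb')
try:
    while True:
        fp_name, fp_scores = cPickle.load(infile)  # fp_scores holds one ranked list per repetition
except EOFError:
    pass
infile.close()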