for idx, mol in enumerate(actives[k]):
                # mol[1] is handed to getFP; the slot is then overwritten in place
                # with [generated name, fingerprint dict], numbered from 1
                fingerprints = scor.getFP(fp_build, mol[1])
                label = '_'.join((str(target), str(k), 'A', str(idx + 1)))
                actives[k][idx] = [label, fingerprints]

        # read in test actives and calculate fps
        div_actives = []
        div_act_path = inpath_cmp+'ChEMBL/cmp_list_ChEMBL_'+str(target)+'_actives.dat.gz'
        # the with-block closes the gzip handle even if getFP raises
        # NOTE(review): with mode 'r' gzip.open yields bytes under Python 3, where
        # line[0] is an int and never equals '#' - confirm the target interpreter
        with gzip.open(div_act_path, 'r') as div_act_file:
            for line in div_act_file:
                # structure of line: [external ID, internal ID, SMILES]
                fields = line.split()
                # skip blank lines (would fail on fields[2]) and '#' comment lines
                if not fields or line[0] == '#':
                    continue
                fp_dict = scor.getFP(fp_build, fields[2])
                # store: [internal ID, dict with fps]
                div_actives.append([fields[1], fp_dict])
        # size of the test set comes from the configuration, not from the
        # number of lines read above
        # NOTE(review): the reason for the "- 1" is not visible here - confirm
        num_test_actives = conf.num_div_act - 1
        # convert fps to numpy arrays
        np_fps_div_act = ml_func.getNumpy(div_actives)

        # read in decoys and calculate fps
        # only done while firstchembl is true; the flag is cleared at the end so
        # this branch does not run again until the flag is reset
        if firstchembl:
            decoys = []
            decoy_path = inpath_cmp+'ChEMBL/cmp_list_ChEMBL_zinc_decoys.dat.gz'
            # the with-block closes the gzip handle even if getFP raises
            # NOTE(review): with mode 'r' gzip.open yields bytes under Python 3,
            # where line[0] is an int and never equals '#' - confirm interpreter
            with gzip.open(decoy_path, 'r') as decoy_file:
                for line in decoy_file:
                    # structure of line: [external ID, internal ID, SMILES]
                    fields = line.split()
                    # skip blank lines (would fail on fields[2]) and '#' comments
                    if not fields or line[0] == '#':
                        continue
                    fp_dict = scor.getFP(fp_build, fields[2])
                    # store: [internal ID, dict with fps]
                    decoys.append([fields[1], fp_dict])
            # convert fps to numpy arrays
            np_fps_dcy = ml_func.getNumpy(decoys)
            firstchembl = False
            num_decoys = len(decoys)
Esempio n. 2
0
            # read in actives and calculate fps
            actives = []
            actives_path = (inpath_cmp + dataset + '/cmp_list_' + dataset + '_' +
                            str(target) + '_actives.dat.gz')
            # the with-block closes the gzip handle even if getFP raises
            # NOTE(review): with mode 'r' gzip.open yields bytes under Python 3,
            # where line[0] is an int and never equals '#' - confirm interpreter
            with gzip.open(actives_path, 'r') as actives_file:
                for line in actives_file:
                    # structure of line: [external ID, internal ID, SMILES]
                    fields = line.split()
                    # skip blank lines (would fail on fields[2]) and '#' comments
                    if not fields or line[0] == '#':
                        continue
                    fp_dict = scor.getFP(fp_build, fields[2])
                    # store: [internal ID, dict with fps]
                    actives.append([fields[1], fp_dict])
            num_actives = len(actives)
            # the test set is every active read minus num_query_mols
            num_test_actives = num_actives - num_query_mols
            # convert fps to numpy arrays
            np_fps_act = ml_func.getNumpy(actives)

            # read in decoys and calculate fps
            if dataset == 'ChEMBL':
                if firstchembl:
                    decoys = []
                    for line in gzip.open(
                            inpath_cmp + dataset + '/cmp_list_' + dataset +
                            '_zinc_decoys.dat.gz', 'r'):
                        if line[0] != '#':
                            # structure of line: [external ID, internal ID, SMILES]]
                            line = line.rstrip().split()
                            fp_dict = scor.getFP(fp_build, line[2])
                            # store: [internal ID, dict with fps]
                            decoys.append([line[1], fp_dict])
                    # convert fps to numpy arrays