Esempio n. 1
0
def main(para):
    """Predict interface residues for a hard-coded list of protein pairs.

    Options missing from ``para`` are filled with defaults in place. If the
    file named by ``para['ModelFile']`` does not exist, a model is first
    trained through ``cross_validation.main`` with ``SplitFold`` set to '1'.
    The pair list is then written to ``list_from_user.txt``, features are
    generated by ``prepare_feature.main`` and scored with ``model_predict``.
    Per-residue predictions are written to ``para['ExeFile'] + 'data.txt'``,
    read back, grouped per protein pair, and for each protein of a pair the
    residue numbers scoring at or above ``PredictCutoff`` are passed to
    ``show``.

    Args:
        para: option dict, modified in place. ``'ExeFile'`` has no default
            and must be supplied by the caller.

    Raises:
        KeyError: if ``para`` has no ``'ExeFile'`` entry (raised only once
            the prediction step has finished).
    """
    ## Defaults, applied only to options the caller did not supply.
    ## 'FeatureType' is guarded by its own key: the previous guard tested
    ## 'DockMethod', which overwrote a caller-supplied FeatureType.
    defaults = (
        ('ListSize', '-1'),
        ('ThreadNum', '1'),
        ('RandomSeed', '2014'),
        ('ModelName', 'RF-bin'),
        ('ModelFile', 'set1_all_new.txt.fea.max.RF-bin'),
        ('FeatureType', 'SaveResidue'),
        ('SolutionNum', '10'),
        ('PredictCutoff', '0.5'),
    )
    for key, value in defaults:
        if key not in para:
            para[key] = value

    ## Step 1: docking all pdb chain pairs and train
    if not os.path.exists(para['ModelFile']):
        import cross_validation
        para1 = para.copy()
        para1['SplitFold'] = '1'
        cross_validation.main(para1)

    ## Step 2: docking new pairs and predict
    ## prepare pdb pair list
    pdblistfile = 'list_from_user.txt'
    with open(pdblistfile, 'w') as tempfile:
        tempfile.write('Hhp1\tTas3\t4HOK\tA\t3D1D\tA\n')
        #tempfile.write('Hhp1\tMoc3\t4HOK\tA\tMOC3_modbase\t \n')
        tempfile.write('Hhp1\tPpc89\t4HOK\tA\tPPC89_Modbase\t \n')

    ## docking them and generate features
    ## (always regenerated, even if the feature file already exists)
    feature_file = 'features_for_predicting.txt'
    import prepare_feature
    para2 = para.copy()
    para2['ListFile'] = pdblistfile
    para2['OutFile'] = feature_file
    para2['ListFormat'] = 'p1/p2/pdb1/ch1/pdb2/ch2'
    prepare_feature.main(para2)

    ## 2.c: predict their contact probabilities
    from cross_validation import add_residue_label, model_predict
    add_residue_label(feature_file)
    predfile = model_predict(feature_file, model=para['ModelName'], mfile=para['ModelFile'])

    ## Step 3: get predicted values for each residue
    from cross_validation import map_pdb_residue
    residue_value = map_pdb_residue(predfile)
    data_file = para['ExeFile']+'data.txt'
    with open(data_file, 'w') as outfile:
        for g1g2, res, val in residue_value:
            outfile.write('%s\t%s\t-1\t%s\n'%('\t'.join(g1g2.split('=')), res, val))
    from evaluate_perform import read_residue_data, group_residue
    idx, val1, val2 = read_residue_data(data_file)
    pp_val = group_residue(idx, val2) ## using predicted value

    ## Convert the cutoff once instead of once per residue
    cutoff = float(para['PredictCutoff'])
    for p1, p2 in pp_val:
        res = pp_val[(p1, p2)]
        ## Protein names are the text before the first ':' of each residue
        ## id; sorted so the p1/p2 order does not depend on set hashing
        pp = sorted(set(r.split(':')[0] for r in res))
        ## reformat protein names
        if p1 == p2 and len(pp) == 1:
            ## take the name itself, not the one-element list: a list never
            ## equals the string prefix compared against below
            p1 = p2 = pp[0]
        elif len(pp) == 2:
            p1, p2 = pp
        else:
            ## keep going with the unmapped names, as before
            print('Failed to map %s %s %s' % (p1, p2, pp))
        show(p1)
        show(p2)
        for protein in (p1, p2):
            numbers = sorted(int(r.split(':')[-1]) for r in res
                             if r.split(':')[0] == protein and res[r] >= cutoff)
            show(','.join(str(n) for n in numbers))
        show()
Esempio n. 2
0
def main(para):
    """Cross-validate the residue model over a protein-pair list.

    Options missing from ``para`` are filled with defaults in place. For each
    (train, test) pair yielded by ``split_data`` a model is trained on the
    train list and applied to the test list; the mapped predictions are
    accumulated over all folds. When ``SplitFold`` is '1' the full list file
    is used for both training and testing and no files are removed.

    Afterwards every collected value set is intersected with the real labels
    from ``get_res_labels``, written to ``cv_cocry_<name>_<seed>.txt`` and
    passed to ``performance`` twice (FPR/TPR and TPR/PPV axes), with each
    returned area handed to ``show``.

    Args:
        para: option dict, modified in place. ``'DataPath'`` is required
            when ``'ListFile'`` is not given.
    """
    ## The ListFile default depends on DataPath, so it is only looked up
    ## when ListFile is really missing
    if 'ListFile' not in para:
        para['ListFile'] = para['DataPath']+'/set1_all_new.txt'
    for option, fallback in (('SplitFold', '10'),
                             ('SolutionNum', '10'),
                             ('RandomSeed', '2014'),
                             ('ThreadNum', '1'),
                             ('FeatureType', 'SaveResidue'),
                             ('ModelName', 'RF-bin')):
        if option not in para:
            para[option] = fallback

    pred_value = [] ## from learning model
    ## ``useidx`` values given to map_pdb_residue for each feature type;
    ## any other feature type collects nothing extra
    indices_by_type = {
        'SaveResidue': (2, 3, 4, 5),
        'SavePatchDock': (2, 3, 4),
        'SaveSequence': (2, 3, 4),
    }
    other_vals = dict((useidx, []) for useidx
                      in indices_by_type.get(para['FeatureType'], ()))

    all_res, pp_int = get_res_labels(para)
    import prepare_feature
    single_fold = para['SplitFold'] == '1'
    folds = split_data(para['ListFile'], int(para['SplitFold']), int(para['RandomSeed']))
    for train, test in folds:
        if single_fold:
            train = test = para['ListFile']
        if False: ## compare DDI network (switched off)
            from generate_hSIN import generate_ddi2, get_pdb_subset
            for listfile in (train, test):
                generate_ddi2(output_file = listfile + '.ddi',
                              pdb_subset = get_pdb_subset(listfile))
            from domain_map import reduced_ddi
            train_ddi = reduced_ddi(train + '.ddi')
            test_ddi = reduced_ddi(test + '.ddi')

        ###############################################################
        ## Train
        show(train, False)
        para2 = para.copy() ## per-fold parameters, caller's dict untouched
        para2.update({'ListFile': train,
                      'ListSize': '-1',
                      'ListFormat': 'p1/p2/pdb1/ch1/pdb2/ch2'})
        prepare_feature.main(para2)
        ## NOTE(review): 'OutFile' / 'MapFile' are presumably filled into
        ## para2 by prepare_feature.main -- confirm
        train_resfile = combine_pdb_residue(para2['OutFile'])
        show(add_residue_label(train_resfile, pp_int), False)
        mfile = train_model(train_resfile, model=para['ModelName'])

        ###############################################################
        ## Test
        show(test, False)
        para2['ListFile'] = test
        prepare_feature.main(para2)
        resfile = combine_pdb_residue(para2['OutFile'])
        show(add_residue_label(resfile, None), False)
        outfile = model_predict(resfile, model=para['ModelName'], mfile=mfile)

        ## Save values
        if not os.path.exists(outfile):
            continue ## skip this fold
        pred_value.extend(map_pdb_residue(outfile, para2['MapFile']))
        for useidx in other_vals:
            other_vals[useidx].extend(
                map_pdb_residue(resfile, para2['MapFile'], useidx=useidx))
        show()
        if not single_fold:
            ## clean files
            os.system('rm %s*'%train)
            os.system('rm %s*'%test)

    ## Get the real labels of residues in predicted protein pairs
    pred_pp = set(pair for pair, _res, _val in pred_value)
    show('Performance based on %s protein pairs'%len(pred_pp))
    real_value = [(pair, residue, label) ## from cocrystal
                  for pair, residue, label in all_res if pair in pred_pp]

    ## Compare every collected value set against real_value
    save_list = [('cocry', real_value),
                 ('pred-'+para['FeatureType']+'-'+para['ModelName'], pred_value)]
    save_list += [(para['FeatureType']+'-idx%s'%useidx, other_vals[useidx])
                  for useidx in other_vals]
    for name, value in save_list:
        head, real, pred = [], [], []
        for pair, residue, actual, guess in intersect_poslist(real_value, value):
            p1, p2 = pair.split('=')
            head.append([p1, p2, residue])
            real.append(actual)
            pred.append(guess)
        with open('cv_cocry_'+name+'_'+para['RandomSeed']+'.txt', 'w') as tempfile:
            for columns, actual, guess in zip(head, real, pred):
                tempfile.write('%s\t%s\t%s\n'%('\t'.join(columns), actual, guess))
        ## evaluate
        show(name, False)
        for x_axis, y_axis in (('FPR', 'TPR'), ('TPR', 'PPV')):
            area, px, py, pc = performance(real, pred, x=x_axis, y=y_axis)
            show(area, False)
        show()