Ejemplo n.º 1
0
Archivo: utils.py Proyecto: nrnb/EPIC
def Goldstandard_from_cluster_File(gsF, foundprots=""):
    """Build a gold standard from the clusters stored in a file.

    The clusters are read from ``gsF`` into a ``GS.Clusters`` object
    (created with ``need_to_be_mapped=False``), optionally passed through
    ``remove_proteins(foundprots)``, and attached to a new
    ``GS.Goldstandard_from_Complexes("All")`` whose ``make_pos_neg_ppis()``
    is then invoked.

    :param gsF: path handed to ``GS.Clusters.read_file``.
    :param foundprots: argument for ``remove_proteins``; the default ``""``
        skips that step entirely.
    :return: the populated gold-standard object.
    """
    reference = GS.Clusters(need_to_be_mapped=False)
    reference.read_file(gsF)

    restrict_to_found = foundprots != ""
    if restrict_to_found:
        reference.remove_proteins(foundprots)

    gold = GS.Goldstandard_from_Complexes("All")
    gold.complexes = reference
    gold.make_pos_neg_ppis()
    return gold
Ejemplo n.º 2
0
Archivo: utils.py Proyecto: nrnb/EPIC
def create_goldstandard(clusters, target_taxid, valprots):
    """Create the "Goldstandard" reference set from ``clusters``.

    An ``GS.Inparanoid`` ortholog map is created for ``target_taxid``
    unless the taxid is ``"9606"`` or the empty string, in which case the
    empty string is passed instead of a map.

    :param clusters: forwarded to ``make_reference_data``.
    :param target_taxid: taxonomy id string deciding whether a map is built.
    :param valprots: forwarded to ``make_reference_data`` as ``found_prots``.
    :return: the ``GS.Goldstandard_from_Complexes`` object after
        ``make_reference_data`` has been called on it.
    """
    skip_mapping = target_taxid in ("9606", "")
    orthmap = "" if skip_mapping else GS.Inparanoid(taxid=target_taxid)

    goldstandard = GS.Goldstandard_from_Complexes("Goldstandard")
    goldstandard.make_reference_data(clusters, orthmap, found_prots=valprots)
    return goldstandard
Ejemplo n.º 3
0
def orth_map(args):
    """Map the complexes of a cluster file through Inparanoid and save them.

    :param args: 3-item sequence ``(clusterF, taxid, outF)``:
        ``clusterF`` is read with ``GS.Clusters.read_file``, ``taxid`` is
        given to ``GS.Inparanoid``, and ``outF`` is the path the resulting
        ``clust.to_string()`` text is written to (overwritten if present).
    :return: None.
    """
    clusterF, taxid, outF = args

    clust = GS.Clusters(False)
    clust.read_file(clusterF)

    orthmap = GS.Inparanoid(taxid=taxid)
    orthmap.mapComplexes(clust)

    clust.merge_complexes()
    clust.filter_complexes()

    # The context manager closes the file even if to_string()/write() raises.
    with open(outF, "w") as outFH:
        outFH.write(clust.to_string())
Ejemplo n.º 4
0
def make_eval(args):
    """Run the clustering evaluation for a predicted/reference file pair.

    :param args: 2-item sequence ``(pred_clust_F, ref_clust_F)`` with the
        paths of the predicted and the reference cluster file.
    :return: None. The ``(scores, head)`` pair produced by
        ``utils.clustering_evaluation`` is unpacked but neither returned
        nor written anywhere by this function.
    """
    pred_clust_F, ref_clust_F = args

    # Load both files the same way, predicted clusters first.
    loaded = []
    for cluster_path in (pred_clust_F, ref_clust_F):
        cluster_set = GS.Clusters(False)
        cluster_set.read_file(cluster_path)
        loaded.append(cluster_set)
    pred_clusters, ref_clusters = loaded

    scores, head = utils.clustering_evaluation(
        ref_clusters, pred_clusters, "", True)
Ejemplo n.º 5
0
Archivo: utils.py Proyecto: nrnb/EPIC
def load_data(data, scores, orthmap="", fc=2, mfc=1):
    """Load elution files and initialise every score with each of them.

    :param data: either a list/tuple of elution file paths (list subclasses
        are accepted too), or a directory path whose immediate files are all
        loaded.
    :param scores: iterable of score objects; ``score.init(elutionData)`` is
        called for every loaded file.
    :param orthmap: table path given to ``GS.Inparanoid.readTable``; the
        default ``""`` (or ``False``) disables ortholog mapping.
    :param fc: forwarded to ``CS.ElutionData`` as ``frac_count``.
    :param mfc: forwarded to ``CS.ElutionData`` as ``max_frac_count``.
    :return: ``(elutionProts, elutionDatas)`` where ``elutionProts`` is the
        union of the ``prot2Index`` keys of all loaded files and
        ``elutionDatas`` is the list of ``CS.ElutionData`` objects.
    """
    # isinstance() rather than an exact type check, so that tuples and list
    # subclasses are treated as path collections instead of being walked as
    # a directory.
    if isinstance(data, (list, tuple)):
        paths = data
    else:
        paths = [os.path.join(data, fn) for fn in next(os.walk(data))[2]]

    # Both "" and False (or anything equal to False) switch mapping off.
    use_orthmap = orthmap != "" and orthmap != False  # noqa: E712

    elutionDatas = []
    elutionProts = set()
    for elutionFile in paths:
        # Ignore files whose name starts with a dot.
        if elutionFile.rsplit(os.sep, 1)[-1].startswith("."):
            continue
        elutionFile = elutionFile.rstrip()
        elutionData = CS.ElutionData(elutionFile,
                                     frac_count=fc,
                                     max_frac_count=mfc)
        if use_orthmap:
            mapper = GS.Inparanoid("", inparanoid_cutoff=1)
            mapper.readTable(orthmap, direction=0)
            elutionData.orthmap(mapper)
        elutionDatas.append(elutionData)
        elutionProts.update(elutionData.prot2Index.keys())
        for score in scores:
            score.init(elutionData)
    return elutionProts, elutionDatas
Ejemplo n.º 6
0
def rf_cutoff(args):
    """Evaluate predicted clusters for one cutoff and write a summary line.

    :param args: 5-item sequence
        ``(pred_clust_F, ref_clust_F, ppiF, cutoff, outF)``: paths of the
        predicted and reference cluster files, the PPI file whose lines are
        counted with ``CS.lineCount``, the cutoff value echoed to the
        output, and the output path (overwritten if present).
    :return: None. One tab-separated line is written to ``outF``:
        cutoff, number of PPI lines, ``len(pred_clusters.complexes)`` and
        the scores string from ``utils.clustering_evaluation``.
    """
    pred_clust_F, ref_clust_F, ppiF, cutoff, outF = args

    num_ppis = CS.lineCount(ppiF)
    pred_clusters = GS.Clusters(False)
    pred_clusters.read_file(pred_clust_F)

    ref_clusters = GS.Clusters(False)
    ref_clusters.read_file(ref_clust_F)

    scores, head = utils.clustering_evaluation(ref_clusters, pred_clusters, "",
                                               True)

    # The context manager closes the file even if formatting/writing fails.
    with open(outF, "w") as outFH:
        outFH.write("%s\t%i\t%i\t%s\n" %
                    (cutoff, num_ppis, len(pred_clusters.complexes), scores))
Ejemplo n.º 7
0
def Goldstandard_from_PPI_File(gsF, foundprots=""):
    """Build a gold standard from a tab-separated PPI file.

    Each non-blank line must hold at least three tab-separated fields:
    protein A, protein B and a class label. The pair is stored as a
    tab-joined, sorted edge in ``out.positive`` when the label is exactly
    ``"positive"`` and in ``out.negative`` for any other label.

    :param gsF: path of the PPI file.
    :param foundprots: container of protein ids; when not ``""``, edges with
        at least one protein missing from it are skipped.
    :return: the ``GS.Goldstandard_from_Complexes("gs")`` object with the
        edges added.
    :raises ValueError: if a non-blank line has fewer than three fields.
    """
    out = GS.Goldstandard_from_Complexes("gs")
    # The context manager closes the file even if a malformed line raises.
    with open(gsF) as gsFH:
        for line in gsFH:
            line = line.rstrip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no edge.
                continue
            ida, idb, class_label = line.split("\t")[0:3]
            if foundprots != "" and (ida not in foundprots
                                     or idb not in foundprots):
                continue
            edge = "\t".join(sorted([ida, idb]))
            if class_label == "positive":
                out.positive.add(edge)
            else:
                out.negative.add(edge)
    return out
Ejemplo n.º 8
0
Archivo: utils.py Proyecto: nrnb/EPIC
def get_reference_from_net(target_taxid):
    """Assemble the list of reference cluster sources for ``target_taxid``.

    The list always holds ``GS.Intact_clusters``, ``GS.CORUM`` and
    ``GS.QuickGO("9606", ...)``, each constructed with a boolean flag that
    is True only when ``target_taxid`` differs from ``"9606"``. In that
    case a fourth source, ``GS.QuickGO(target_taxid, False)``, is appended.

    :param target_taxid: taxonomy id string.
    :return: list of reference cluster objects in the order given above.
    """
    other_species = target_taxid != "9606"

    sources = [
        GS.Intact_clusters(other_species),
        GS.CORUM(other_species),
        GS.QuickGO("9606", other_species),
    ]
    if other_species:
        sources.append(GS.QuickGO(target_taxid, False))
    return sources
Ejemplo n.º 9
0
def cut(args):
    """Write the scores of a selected feature combination to a file.

    :param args: 3-item sequence ``(fc, scoreF, outF)``: the feature
        selection string given to ``get_fs_comb``, the score table read with
        ``readTable``, and the output path (overwritten if present).
    :return: None. Exits the interpreter via ``sys.exit()`` when ``fc`` is
        ``"00000000"``.

    The output starts with the tab-joined ``feature_comb.scoreCalc.header``
    followed by one ``edge<TAB>scores...`` line per entry returned by
    ``feature_comb.get_next()``; entries with an empty edge or an empty
    score list are skipped.
    """
    fc, scoreF, outF = args
    if fc == "00000000":
        sys.exit()
    this_scores = get_fs_comb(fc)
    scoreCalc = CS.CalculateCoElutionScores("", "", "", "", cutoff=0.5)
    empty_gs = GS.Goldstandard_from_Complexes()
    empty_gs.positive = set()
    empty_gs.negative = set()
    scoreCalc.readTable(scoreF, empty_gs)
    # Single-argument print(...) behaves identically under Python 2 and 3.
    print(scoreCalc.to_predict)
    feature_comb = feature_selector([fs.name for fs in this_scores], scoreCalc)
    feature_comb.open()
    try:
        # Output file is closed first, then feature_comb, as before; both
        # are now released even when writing fails part-way.
        with open(outF, "w") as outFH:
            outFH.write("\t".join(feature_comb.scoreCalc.header) + "\n")
            for _ in range(feature_comb.to_predict):
                edge, edge_scores = feature_comb.get_next()
                if edge == "" or edge_scores == []:
                    continue
                outFH.write("%s\t%s\n" %
                            (edge, "\t".join(map(str, edge_scores))))
    finally:
        feature_comb.close()
Ejemplo n.º 10
0
Archivo: utils.py Proyecto: nrnb/EPIC
def _ratio_to_mean_size(shared, size_a, size_b):
    """Return ``shared`` divided by the mean of ``size_a`` and ``size_b``.

    True (floating point) division is used regardless of the Python version.
    When both sizes are zero the ratio is undefined and NaN is returned.
    """
    mean_size = (size_a + size_b) / 2.0
    if mean_size == 0:
        return float("nan")
    return float(shared) / mean_size


def stability_evaluation(n_fold, all_gs, scoreCalc, clf, output_dir, mode,
                         anno_source, anno_F):
    """Measure how stable predicted PPIs and complexes are across folds.

    For every fold the classifier is benchmarked with ``bench_clf``, a
    network is predicted with ``make_predictions``, written to
    ``<output_dir>.<mode><anno_source>.pred.txt`` and clustered into
    ``<output_dir>.<mode><anno_source>.clust.txt`` (both files are
    overwritten by every fold). Afterwards two ``n_fold x n_fold`` matrices
    of pairwise overlap ratios (PPIs and complexes) are printed and saved
    as tab-delimited text files next to ``output_dir``.

    :param n_fold: number of folds.
    :param all_gs: gold standard providing ``split_into_n_fold2`` and
        ``complexes``.
    :param scoreCalc: score calculator; its ``ppiToIndex`` keys are the
        PPIs handed to the fold split.
    :param clf: classifier passed to ``bench_clf``/``make_predictions``.
    :param output_dir: prefix for every file written here.
    :param mode: when not ``"exp"``, functional data is loaded via
        ``get_FA_data``; also part of the output file names.
    :param anno_source: annotation source, part of the output file names.
    :param anno_F: annotation file passed to ``get_FA_data``.
    :return: None.
    """
    tmp_train_eval_container = (all_gs.split_into_n_fold2(
        n_fold, set(scoreCalc.ppiToIndex.keys()))["turpleKey"])

    # Predicted PPIs (as edge sets) and predicted complexes, keyed by fold.
    PPIs_dict_for_each_fold = {}
    complexes_dict_for_each_fold = {}

    for index in range(n_fold):

        # "eval_gs" instead of "eval" so the builtin is not shadowed.
        train, eval_gs = tmp_train_eval_container[index]

        # Single-argument print(...) behaves identically under Python 2.
        print("All comp:%i" % len(all_gs.complexes.complexes))
        print("Train comp:%i" % len(train.complexes.complexes))
        print("Eval comp:%i" % len(eval_gs.complexes.complexes))

        print("Num valid ppis in training pos: %i" % len(train.positive))
        print("Num valid ppis in training neg: %i" % len(train.negative))
        print("Num valid ppis in eval pos: %i" % len(eval_gs.positive))
        print("Num valid ppis in eval neg: %i" % len(eval_gs.negative))

        # Evaluate classifier
        bench_clf(scoreCalc, train, eval_gs, clf, output_dir, verbose=True)

        # NOTE(review): the functional data is loaded and its shape printed,
        # but the prediction below always runs in "exp" mode with
        # fun_anno="" - confirm whether that is intended.
        functionalData = ""
        if mode != "exp":
            functionalData = get_FA_data(anno_source, anno_F)
            print(functionalData.scores.shape)

        print("the functional evidence data shape is: ")

        # Predict protein interaction based on n_fold cross validation
        network = make_predictions(scoreCalc,
                                   "exp",
                                   clf,
                                   train,
                                   fun_anno="",
                                   verbose=False)

        # The network is written to disk because the complex prediction
        # step reads it from a file.
        pred_path = "%s.%s.pred.txt" % (output_dir, mode + anno_source)
        clust_path = "%s.%s.clust.txt" % (output_dir, mode + anno_source)
        with open(pred_path, "w") as outFH:
            outFH.write("\n".join(network) + "\n")

        PPIs_dict_for_each_fold[index] = set(get_network_edges(network))

        # Predict clusters from the predicted PPI network and load them.
        predict_clusters(pred_path, clust_path)

        pred_clusters = GS.Clusters(False)
        pred_clusters.read_file(clust_path)

        complexes_dict_for_each_fold[index] = pred_clusters

        print("fold " + str(index + 1) + "is done")

    # Pairwise overlap ratios between folds, initialised with zeros.
    overlapped_ratio_matrix_PPIs = np.zeros((n_fold, n_fold))
    overlapped_ratio_matrix_complexes = np.zeros((n_fold, n_fold))

    for i in range(n_fold):
        ppis_i = PPIs_dict_for_each_fold[i]
        complexes_i = complexes_dict_for_each_fold[i]
        for j in range(n_fold):
            ppis_j = PPIs_dict_for_each_fold[j]
            complexes_j = complexes_dict_for_each_fold[j]

            # Shared PPIs relative to the mean network size. Float division
            # is forced: plain "/" on ints truncates under Python 2.
            overlapped_ratio_matrix_PPIs[i, j] = _ratio_to_mean_size(
                len(ppis_i & ppis_j), len(ppis_i), len(ppis_j))

            # Overlapping complexes are counted in both directions and
            # averaged (getOverlapp is assumed to return a number, as the
            # original arithmetic already required).
            overlapped_no1 = complexes_i.getOverlapp(complexes_j, cutoff=0.25)
            overlapped_no2 = complexes_j.getOverlapp(complexes_i, cutoff=0.25)
            averaged_overlapped_complexes_no = (overlapped_no1 +
                                                overlapped_no2) / 2.0

            overlapped_ratio_matrix_complexes[i, j] = _ratio_to_mean_size(
                averaged_overlapped_complexes_no,
                len(complexes_i.get_complexes()),
                len(complexes_j.get_complexes()))

    print(overlapped_ratio_matrix_PPIs)
    print(overlapped_ratio_matrix_complexes)

    # Save both overlap matrices as text files for the stability analysis.
    filename1 = output_dir + " n_fold_corss_validation_PPIs overlap matrix.txt"
    filename2 = output_dir + " n_fold_corss_validation_complexes overlap matrix.txt"

    np.savetxt(filename1, overlapped_ratio_matrix_PPIs, delimiter='\t')
    np.savetxt(filename2, overlapped_ratio_matrix_complexes, delimiter='\t')
Ejemplo n.º 11
0
def _build_arg_parser():
    """Create the command-line parser used by ``main``."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-s",
        "--feature_selection",
        type=str,
        help=
        "Select which features to use. This is an 8 position long array of 0 and 1, where each position determines which co-elution feature to use. Features sorted by position are: MI, Bayes, Euclidean, WCC, Jaccard, PCCN, PCC, and Apex.  Each default=11101001",
        default="11101001")
    parser.add_argument(
        "input_dir",
        type=str,
        help="Directory containing the elution files for each experiment")

    parser.add_argument(
        "-t",
        "--taxid",
        type=str,
        help="TAXID to automatically download reference from GO,CORUM,INtACT",
        default="")
    parser.add_argument(
        "-c",
        "--cluster",
        type=str,
        help="Path to file containing protein clsuter reference",
        default="")
    parser.add_argument("-p",
                        "--ppi",
                        type=str,
                        help="path to ppi File",
                        default="")

    parser.add_argument("output_dir",
                        type=str,
                        help="Directory containing the output files")
    parser.add_argument("-o",
                        "--output_prefix",
                        type=str,
                        help="Prefix name for all output Files",
                        default="Out")

    parser.add_argument(
        "-M",
        "--classifier",
        type=str,
        help="Select which classifier to use. Values: RF SVM, default RF",
        default="RF")
    parser.add_argument("-n",
                        "--num_cores",
                        type=int,
                        help="Number of cores to be used, default 1",
                        default=1)

    parser.add_argument(
        "-m",
        "--mode",
        type=str,
        help=
        "Run EPIC with experimental, functional, or both evidences. Values: EXP, FA, COMB, default: EXP  ",
        default="EXP")
    parser.add_argument(
        "-f",
        "--fun_anno_source",
        type=str,
        help=
        "Where to get functional annotaiton from. Values: STRING or GM or FILE, default= GM",
        default="GM")
    parser.add_argument(
        "-F",
        "--fun_anno_file",
        type=str,
        help=
        "Path to File containing functional annotation. This flag needs to be set when using FILE as fun_anno_source.",
    )
    parser.add_argument("-r",
                        "--co_elution_cutoff",
                        type=float,
                        help="Co-elution score cutoff. default 0.5",
                        default=0.5)
    parser.add_argument(
        "-R",
        "--classifier_cutoff",
        type=float,
        help="Classifier confidence valye cutoff. default = 0.5",
        default=0.5)
    parser.add_argument(
        "-e",
        "--elution_max_count",
        type=int,
        help=
        "Removies protein that have a maximal peptide count less than the given value. default = 1",
        default=1)
    parser.add_argument(
        "-E",
        "--frac_count",
        type=int,
        help=
        "Number of fracrions a protein needs to be measured in. default = 2",
        default=2)

    parser.add_argument(
        "-P",
        "--precalcualted_score_file",
        type=str,
        help=
        "Path to precalulated scorefile to read scores from for faster rerunning of EPIC. default = None",
        default="NONE")
    return parser


def main():
    """Command-line entry point: score, benchmark, predict and evaluate.

    Steps, in order: parse the command line, load the elution data, build
    the reference (from taxid, cluster file or PPI file), write the
    reference complexes, compute or read the co-elution scores, benchmark
    the classifier with 5-fold cross validation, predict the PPI network,
    cluster it and write the clustering evaluation.

    Files written (``<out>`` is ``output_dir + os.sep + output_prefix``):
    ``<out>.ref_complexes.txt``, ``<out>.scores.txt`` (path handed to the
    score calculator), ``<out>.pred.txt``, ``<out>.clust.txt`` and
    ``<out>.eval.txt``.

    Exits via ``sys.exit()`` when no feature is selected, when no reference
    is supplied, or when a PPI reference is combined with a taxid/cluster
    reference.
    """
    args = _build_arg_parser().parse_args()

    args.mode = args.mode.upper()
    args.fun_anno_source = args.fun_anno_source.upper()

    # Create feature combination
    # (single-argument print(...) behaves identically under Python 2 and 3)
    if args.feature_selection == "00000000":
        print("Select at least one feature")
        sys.exit()

    this_scores = utils.get_fs_comb(args.feature_selection)
    print("\t".join([fs.name for fs in this_scores]))

    # Initialize CLF
    use_rf = args.classifier == "RF"
    clf = CS.CLF_Wrapper(args.num_cores, use_rf)

    # Load elution data
    foundprots, elution_datas = utils.load_data(args.input_dir,
                                                this_scores,
                                                fc=args.frac_count,
                                                mfc=args.elution_max_count)

    # Generate reference data set
    gs = ""
    if ((args.taxid != "" and args.ppi != "")
            or (args.cluster != "" and args.ppi != "")):
        print("Refernce from cluster and PPI are nor compatiple. Please supply ppi or complex reference, not both!")
        sys.exit()

    if args.taxid == "" and args.ppi == "" and args.cluster == "":
        print("Please supply a reference by setting taxid, cluster, or ppi tag")
        sys.exit()

    gs_clusters = []
    if (args.taxid != "" and args.cluster == "" and args.ppi == ""):
        print("Loading clusters from GO, CORUM, and Intact")
        gs_clusters.extend(utils.get_reference_from_net(args.taxid))

    if args.cluster != "":
        print("Loading complexes from file")
        if args.mode == "FA":
            gs_clusters.append(GS.FileClusters(args.cluster, "all"))
        else:
            gs_clusters.append(GS.FileClusters(args.cluster, foundprots))

    if args.ppi != "":
        # The parser defines no "reference" option; the PPI file path is
        # stored in args.ppi.
        print("Reading PPI file from %s" % args.ppi)
        # NOTE(review): called unqualified while every other helper goes
        # through "utils." - confirm this name is in scope in this module.
        gs = Goldstandard_from_PPI_File(args.ppi, foundprots)

    print(gs_clusters)
    if len(gs_clusters) > 0:
        gs = utils.create_goldstandard(gs_clusters, args.taxid, foundprots)

    output_dir = args.output_dir + os.sep + args.output_prefix

    with open(output_dir + ".ref_complexes.txt", "w") as refFH:
        for comp in gs.complexes.complexes:
            refFH.write("%s\t%s\n" % (",".join(comp), ",".join(
                gs.complexes.complexes[comp])))

    scoreCalc = CS.CalculateCoElutionScores(this_scores,
                                            elution_datas,
                                            output_dir + ".scores.txt",
                                            num_cores=args.num_cores,
                                            cutoff=args.co_elution_cutoff)
    if args.precalcualted_score_file == "NONE":
        scoreCalc.calculate_coelutionDatas(gs)
    else:
        scoreCalc.readTable(args.precalcualted_score_file, gs)

    print(scoreCalc.scores.shape)

    # Keep only reference PPIs that have scores, then rebalance the classes.
    functionalData = ""
    scored_ppis = set(scoreCalc.ppiToIndex.keys())
    gs.positive = set(gs.positive & scored_ppis)
    gs.negative = set(gs.negative & scored_ppis)
    gs.rebalance()

    print(len(gs.positive))
    print(len(gs.negative))

    if args.mode != "EXP":
        print("Loading functional data")
        functionalData = utils.get_FA_data(args.fun_anno_source, args.taxid,
                                           args.fun_anno_file)
        print("Dimension of fun anno " + str(functionalData.scores.shape))

    print("Start benchmarking")

    if args.mode == "EXP":
        utils.cv_bench_clf(scoreCalc,
                           clf,
                           gs,
                           output_dir,
                           format="pdf",
                           verbose=True,
                           folds=5)

    if args.mode == "COMB":
        # Work on a copy so scoreCalc itself stays free of functional data.
        tmp_sc = copy.deepcopy(scoreCalc)
        tmp_sc.add_fun_anno(functionalData)
        utils.cv_bench_clf(tmp_sc,
                           clf,
                           gs,
                           output_dir,
                           format="pdf",
                           verbose=True,
                           folds=5)

    if args.mode == "FA":
        utils.cv_bench_clf(functionalData,
                           clf,
                           gs,
                           output_dir,
                           format="pdf",
                           verbose=True,
                           folds=5)

    # PPI evaluation
    # NOTE(review): this run uses args.output_dir (without the prefix),
    # unlike the benchmark calls above - confirm that is intended.
    print(utils.cv_bench_clf(scoreCalc,
                             clf,
                             gs,
                             args.output_dir,
                             verbose=False,
                             format="pdf",
                             folds=5))

    network = utils.make_predictions(scoreCalc,
                                     args.mode,
                                     clf,
                                     gs,
                                     fun_anno=functionalData)

    # Predict protein interaction: keep edges whose third tab-separated
    # field (the classifier score) reaches the cutoff.
    pred_path = "%s.pred.txt" % (output_dir)
    clust_path = "%s.clust.txt" % (output_dir)
    with open(pred_path, "w") as outFH:
        final_network = [
            PPI for PPI in network
            if float(PPI.split("\t")[2]) >= args.classifier_cutoff
        ]
        outFH.write("\n".join(final_network) + "\n")

    # Predicting clusters
    utils.predict_clusters(pred_path, clust_path)

    # Evaluating predicted clusters
    pred_clusters = GS.Clusters(False)
    pred_clusters.read_file(clust_path)
    overlapped_complexes_with_reference = gs.get_complexes(
    ).get_overlapped_complexes_set(pred_clusters)
    print("# of complexes in reference dataset: " + str(
        len(overlapped_complexes_with_reference)))
    clust_scores, header, composite_score = utils.clustering_evaluation(
        gs.complexes, pred_clusters, "", False)
    header = header.split("\t")
    clust_scores = clust_scores.split("\t")
    with open("%s.eval.txt" % (output_dir), "w") as outFH:
        for i, head in enumerate(header):
            eval_line = "%s\t%s" % (head, clust_scores[i])
            print(eval_line)
            outFH.write(eval_line + "\n")
Ejemplo n.º 12
0
def n_fold_cross_validation(n_fold, all_gs, scoreCalc, clf, output_dir,
                            overlap, local):
    """Run an n-fold cross validation of the complex prediction.

    For every fold the training PPIs are restricted to those present in
    ``scoreCalc.ppiToIndex`` and rebalanced, a network is predicted, edges
    with a score above 0.5 are written to ``<output_dir>.fold_<i>.pred.txt``
    and clustered into ``<output_dir>.fold_<i>.clust.txt``, and the clusters
    are evaluated against the fold's evaluation complexes.

    :param n_fold: number of folds.
    :param all_gs: gold standard providing ``n_fols_split`` and
        ``complexes``.
    :param scoreCalc: score calculator with a ``ppiToIndex`` mapping.
    :param clf: classifier used for the predictions.
    :param output_dir: prefix of the per-fold output files.
    :param overlap: forwarded to ``all_gs.n_fols_split``.
    :param local: if true, ``utils.make_predictions_cross_validation`` is
        used; otherwise ``utils.predictInteractions``.
    :return: ``(scores, head)``: two tab-joined strings holding the ten
        metrics (and their column names) of every fold followed by the
        per-metric mean over all folds.
    """
    out_scores = []
    out_head = []
    header = [
        "Num_pred_PPIS", "NUM_pred_CLUST", "mmr", "overlapp", "simcoe",
        "mean_simcoe_overlap", "sensetivity", "ppv", "accuracy", "sep"
    ]

    train_eval_container = all_gs.n_fols_split(n_fold, overlap)

    # One row of complex evaluation metrics per fold.
    complex_eval_score_vector = np.zeros((n_fold, len(header)))
    val_ppis = set(scoreCalc.ppiToIndex.keys())

    # Single-argument print(...) behaves identically under Python 2 and 3.
    print("Number of ppis with e-score>0.5: %i" % len(val_ppis))
    for index in range(n_fold):
        print("processinng fold " + str(index + 1))
        # "eval_gs" instead of "eval" so the builtin is not shadowed.
        train, eval_gs = train_eval_container[index]

        fold_prefix = "Fold %i " % (index + 1)
        fold_header = "\t".join([fold_prefix + h for h in header])

        train.positive = train.positive & val_ppis
        train.negative = train.negative & val_ppis
        train.rebalance()

        print("All comp:%i" % len(all_gs.complexes.complexes))
        print("Train comp:%i" % len(train.complexes.complexes))
        print("Eval comp:%i" % len(eval_gs.complexes.complexes))
        print("Num valid ppis in training pos: %i" % len(train.positive
                                                         & val_ppis))
        print("Num valid ppis in training neg: %i" % len(train.negative
                                                         & val_ppis))
        print("Num valid ppis in eval pos: %i" % len(eval_gs.positive))
        print("Num valid ppis in eval neg: %i" % len(eval_gs.negative))

        print("Overlap positive %i" % (len(train.positive & eval_gs.positive)))
        print("Overlap negative %i" % (len(train.negative & eval_gs.negative)))

        if local:
            # Predict protein interaction based on n_fold cross validation
            network = utils.make_predictions_cross_validation(
                scoreCalc, train, eval_gs, clf)
        else:
            network = utils.predictInteractions(scoreCalc,
                                                clf,
                                                train,
                                                verbose=True)

        netF = "%s.fold_%s.pred.txt" % (output_dir, index)
        clustF = "%s.fold_%s.clust.txt" % (output_dir, index)

        if len(network) == 0:
            # Nothing to cluster: record an all-zero row for this fold.
            print("No edges were predicted")
            tmp_scores = [0] * len(header)
            out_head.append(fold_header)
            out_scores.append("\t".join(map(str, tmp_scores)))
            complex_eval_score_vector[index, :] = tmp_scores
            continue

        # Keep edges above the random forest confidence cutoff of 0.5.
        confident_edges = []
        for ppi in network:
            prota, protb, score = ppi.split("\t")
            if float(score) > 0.5:
                confident_edges.append(ppi)
        network = confident_edges

        # The context manager closes the file even if the write fails.
        with open(netF, "w") as outFH:
            outFH.write("\n".join(network) + "\n")

        # Predicting clusters
        utils.predict_clusters(netF, clustF)

        # Evaluating predicted clusters
        pred_clusters = GS.Clusters(False)
        pred_clusters.read_file(clustF)

        print("number of complexes")
        print(len(pred_clusters.get_complexes()))

        print("number of ppis")
        print(len(network))

        # The header returned by clustering_evaluation is not used; the
        # column names come from ``header`` above.
        fold_scores, _ = utils.clustering_evaluation(
            eval_gs.complexes, pred_clusters, fold_prefix, True)
        out_scores.append(
            "%i\t%i\t%s" %
            (len(network), len(pred_clusters.get_complexes()), fold_scores))
        out_head.append(fold_header)

        tmp_scores = [len(network), len(pred_clusters.get_complexes())]
        tmp_scores.extend(map(float, fold_scores.split("\t")))
        complex_eval_score_vector[index, :] = np.array(tmp_scores)

    averaged_complex_eval_metrics_vector = np.mean(complex_eval_score_vector,
                                                   axis=0)

    out_scores.append("\t".join(map(str,
                                    averaged_complex_eval_metrics_vector)))
    out_head.append("\t".join(["Mean " + h for h in header]))
    return "\t".join(out_scores), "\t".join(out_head)