Beispiel #1
0
def roc_data(df, score_col="score", activity_col="activity"):
    """
    Attach ROC curve coordinates to a table of scored compounds

    The rows are paired up as (score, activity) in their existing order and
    handed to ``sd.RankStatistics``; the two sequences returned by ``ROC()``
    are written back into `df` as the columns "tpr" and "fpr".

    :param df: table indexed by column name (presumably a
        `pandas.DataFrame`); it is modified in place
    :param str score_col: name of the column holding the scores
    :param str activity_col: name of the column holding the activity flags
    :return: the same `df` object, now carrying "tpr" and "fpr"
    """
    score_values = list(df[score_col])
    activity_flags = list(df[activity_col])
    ranked_pairs = list(zip(score_values, activity_flags))

    # the activity flag is the second entry of every (score, activity) pair
    statistics = sd.RankStatistics(scores=ranked_pairs,
                                   activity_column=operator.itemgetter(1))

    true_positive_rate, false_positive_rate = statistics.ROC()
    df["tpr"] = true_positive_rate
    df["fpr"] = false_positive_rate
    return df
Beispiel #2
0
def enrichment(df):
    """
    Summarise the ranking quality of a scored table in a one-row data frame

    The "Score" and "actives" columns of `df` are paired up in their existing
    order and passed to ``sd.RankStatistics``. The result holds the AUC, the
    enrichment factor at fractions 0.01, 0.05 and 0.1, and BEDROC at
    alpha 16.1 and 8.

    :param df: table with "Score" and "actives" columns
    :return: `pandas.DataFrame` with a single row and the columns
        "AUC", "EF1", "EF5", "EF10", "BEDROC16" and "BEDROC8"
    """
    ranked_pairs = list(zip(list(df['Score']), list(df['actives'])))
    statistics = sd.RankStatistics(scores=ranked_pairs,
                                   activity_column=operator.itemgetter(1))

    # every value is wrapped in a one-element list so it becomes a single row
    metric_row = {"AUC": [statistics.AUC()]}
    for column, fraction in (("EF1", 0.01), ("EF5", 0.05), ("EF10", 0.1)):
        metric_row[column] = [statistics.EF(fraction=fraction)]
    for column, alpha in (("BEDROC16", 16.1), ("BEDROC8", 8)):
        metric_row[column] = [statistics.BEDROC(alpha=alpha)]

    return pd.DataFrame(metric_row)
def rocplot(data, a_col=1, fname='roc.png'):
    """
    Draw a ROC curve with seaborn and save it as an image

    :param lists data: ranked data as a list of lists
    :param int a_col: position of the activity entry within each inner list
    :param str fname: path of the image file to write
    :return: None
    """
    activity_getter = operator.itemgetter(a_col)
    rs = sd.RankStatistics(scores=data, activity_column=activity_getter)
    true_pos_rate, false_pos_rate = rs.ROC()

    # the curve itself; estimator=None plots the points without aggregation
    sns.lineplot(x=false_pos_rate,
                 y=true_pos_rate,
                 estimator=None,
                 color="#c75048")
    # grey diagonal from (0, 0) to (1, 1) as a visual reference
    axes = sns.lineplot(x=[0, 1], y=[0, 1], color="grey")

    axes.set(xlabel='FPR',
             ylabel='TPR',
             title=f"ROC Curve (AUC: {rs.AUC():.2f})")

    plt.savefig(fname)
    # close the figure so later plots start from a clean state
    plt.close()
def main():
    """
    Print rank statistics for a screening run and save its ROC plot

    Rows are loaded with ``read_csv`` from a hard-coded path. The first field
    of every row is kept unchanged and the second is converted to ``int`` and
    used as the activity flag. The enrichment factors (1 %, 2 %, 5 %, 10 %),
    AUC, BEDROC and RIE (both with alpha 20) are printed rounded to one
    decimal place, and the ROC curve is written to "test_roc.png".

    :return: None
    """
    rows = read_csv('/home/amukhopadhyay/ligand_screener_testing/screening_scores.csv')
    scores = [[row[0], int(row[1])] for row in rows]

    # NOTE: the same numbers could be obtained from RDKit's Scoring module
    # (CalcAUC, CalcBEDROC, CalcEnrichment, CalcRIE, CalcROC); only CalcROC
    # is still used below, for the plot.

    statistics = StatisticalDescriptors.RankStatistics(
        scores, activity_column=operator.itemgetter(1))
    for fraction in (0.01, 0.02, 0.05, 0.1):
        print(round(statistics.EF(fraction), 1))
    print(round(statistics.AUC(), 1))
    print(round(statistics.BEDROC(alpha=20), 1))
    print(round(statistics.RIE(alpha=20), 1))

    # the activity flag sits in column 1 of every score row
    fpr, tpr = Scoring.CalcROC(scores, 1)
    roc_auc = metrics.auc(fpr, tpr)
    curve_label = 'AUC = %0.2f' % roc_auc

    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label=curve_label)
    plt.legend(loc='lower right')
    # dashed red diagonal from (0, 0) to (1, 1)
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.savefig('test_roc.png')
Beispiel #5
0
    # NOTE(review): this is the interior of a function whose header is not
    # part of this excerpt. `base`, `time`, `ef1`, `con`, `se` and `s` are
    # used below but are not defined in the visible lines - confirm they are
    # set up by the missing part.

    # sub-directory names of the docking runs to evaluate
    runs = [0, 10, 100]
    # fractions at which the enrichment factor is evaluated
    erich_levels = [0.01, 0.05, 0.1]
    # NOTE(review): `frames` is not used in the visible lines
    frames = []

    # parallel lists; each run contributes one entry per enrichment level
    weight = []
    ef = []
    efl = []

    for r in runs:
        f = os.path.join(base, "docking/{}/bestranking.lst".format(r))
        df = process_data(f)

        # (ligand name, score, active flag) triples in the order of `df`
        # NOTE(review): on Python 3 this is a one-shot iterator, not a list -
        # confirm RankStatistics accepts that
        new = zip(list(df["Ligand name"]), list(df["Score"]),
                  list(df["actives"]))

        sd = StatisticalDescriptors()

        # the activity flag is the third entry of every triple
        rank = sd.RankStatistics(new, operator.itemgetter(2))
        # NOTE(review): tpr/fpr are not used in the visible lines
        tpr, fpr = rank.ROC()

        # the run named 100 is left out of the per-run summary lists
        if int(r) != 100:
            # median of the "time" column, converted to float
            time.append(
                np.median(np.array([float(a) for a in list(df['time'])])))
            ef1.append(rank.EF(0.01))
            con.append(int(r))
            se.append(int(s))

        # one entry per enrichment level: the level, its EF and the run name
        efl.extend(erich_levels)
        ef.extend([rank.EF(e) for e in erich_levels])
        weight.extend([r] * len(erich_levels))
def rank_stats(parent, s, w):
    """
    Create two `pandas.DataFrame` from the "bestranking.lst"
    GOLD output

    The file is read from
    ``<parent>/search_efficiency_<s>/<w>/bestranking.lst``. Its sixth line is
    taken as the column header (fields separated by two or more spaces) and
    every non-blank line from the eighth onwards as a space-separated record.
    A ligand counts as active when its "Ligand name" contains "CHEMBL".

    :param str parent: path to parent directory
    :param str s: name of subdirectory (the search efficiency); must be
        convertible to `int`
    :param str w: name of subsubdirectory (the weight); must be convertible
        to `int`
    :return: tuple ``(df, metric_df)``. `df` has one row per ligand, sorted
        by descending "Score", with the columns "Score", "log_time",
        "actives", "search_efficiency", "weight_int", "weight_str", "tpr"
        and "fpr". `metric_df` has a single row with "search efficiency",
        "weight", "AUC", "EF1", "EF5", "EF10", "BEDROC16" and "BEDROC8".
    """
    # read data and process data from output file
    fname = os.path.join(parent, f"search_efficiency_{s}", str(w),
                         "bestranking.lst")
    # the context manager closes the file even if parsing fails later on
    with open(fname, "r") as handle:
        lines = [line.strip("\n") for line in handle]

    # header fields are separated by runs of two spaces; drop the empty
    # pieces and the leading comment marker before trimming
    header = [
        field.strip() for field in lines[5].split("  ")
        if field != '' and field != '#'
    ]

    # skip blank lines: an empty record would truncate the transpose below
    # to zero columns
    records = [[token for token in entry.split(" ") if token != '']
               for entry in lines[7:] if entry.strip()]

    # transpose the records (rows) into columns
    cat = list(zip(*records))

    # generate a dataframe and alter datatypes
    df = pd.DataFrame({h: cat[i] for i, h in enumerate(header)})
    df["actives"] = np.array(
        ['CHEMBL' in name for name in df['Ligand name']]).astype(int)
    df["search_efficiency"] = int(s)
    df["weight_int"] = int(w)
    df["weight_str"] = str(w)
    df["Score"] = df["Score"].astype(float)
    df["time"] = df["time"].astype(float)
    df["log_time"] = np.log10(list(df["time"]))
    df = df[[
        'Score', 'log_time', 'actives', 'search_efficiency', 'weight_int',
        'weight_str'
    ]]
    # best score first, so the rank statistics see the intended ranking
    df = df.sort_values(by=['Score'], ascending=False)

    # Use CCDC's descriptors API
    rs = sd.RankStatistics(scores=list(
        zip(list(df['Score']), list(df['actives']))),
                           activity_column=operator.itemgetter(1))

    # ROC
    tpr, fpr = rs.ROC()
    df["tpr"] = tpr
    df["fpr"] = fpr

    # Enrichment Metrics
    metric_df = pd.DataFrame({
        "search efficiency": [s],
        "weight": [w],
        "AUC": [rs.AUC()],
        "EF1": [rs.EF(fraction=0.01)],
        "EF5": [rs.EF(fraction=0.05)],
        "EF10": [rs.EF(fraction=0.1)],
        "BEDROC16": [rs.BEDROC(alpha=16.1)],
        "BEDROC8": [rs.BEDROC(alpha=8)]
    })
    return df, metric_df
Beispiel #7
0
def rank_hits(hits, rank_by, totals, num, t):
    """
    Rank search hits, deduplicate them and compute enrichment factors

    Hits are sorted by the attribute named in `rank_by` ("simple_score" and
    "sphere_score" descending, "rmsd" ascending). Only the best-ranked hit
    per identifier prefix (the text before the first "_") is kept. Actives
    and decoys that were not returned are appended as blank hits at the end
    of the ranking so that the statistics cover the whole library.

    :param hits: iterable of hashable hit objects exposing `identifier`,
        `activity` and the attribute named by `rank_by`
    :param str rank_by: one of "simple_score", "sphere_score" or "rmsd"
    :param dict totals: library sizes under the keys "actives" and "decoys"
    :param num: value stored in the "num_features" column
    :param t: value stored in the "target" column
    :return: one-row `pandas.DataFrame` with the counts, the returned
        fraction and the enrichment factors at 0.5 %, 1 % and 2 %
    :raises ValueError: if `rank_by` is not a supported score type
    """
    # True: higher is better (sort descending); False: lower is better
    descending_by_score_type = {
        "simple_score": True,
        "sphere_score": True,
        "rmsd": False,
    }
    if rank_by not in descending_by_score_type:
        supported = ", ".join(sorted(descending_by_score_type))
        raise ValueError(
            f"unknown rank_by {rank_by!r}; expected one of: {supported}")
    reverse = descending_by_score_type[rank_by]

    # rank (the sort is stable, so tied hits keep their input order)
    hit_by_score = {hit: getattr(hit, rank_by) for hit in hits}
    all_hits = sorted(hit_by_score, key=hit_by_score.get, reverse=reverse)

    # deduplicate: keep only the best-ranked hit per identifier prefix
    seen = set()
    unique_hits = []
    for h in all_hits:
        ident = h.identifier.split("_")[0]

        if ident not in seen:
            seen.add(ident)
            unique_hits.append(h)

    # roc stats
    counter = Counter(h.activity for h in unique_hits)
    missing_actives = totals["actives"] - counter[1]
    missing_decoys = totals["decoys"] - counter[0]

    # taken before the blanks are added, so it counts real hits only
    num_unique_hits = len(unique_hits)

    # add blanks for the rank stats: they go below every real hit, the
    # missing decoys first and the missing actives last
    unique_hits.extend([
        Hit(molecule=None, rmsd=None, activity=0, identifier="blank")
        for _ in range(missing_decoys)
    ])
    unique_hits.extend([
        Hit(molecule=None, rmsd=None, activity=1, identifier="blank")
        for _ in range(missing_actives)
    ])

    rs = sd.RankStatistics(scores=unique_hits,
                           activity_column=operator.attrgetter("activity"))

    enrichment_stats = pd.DataFrame({
        "target": [t],
        "num_features": [num],
        "total_actives": [totals["actives"]],
        "total_decoys": [totals["decoys"]],
        "returned_actives": [counter[1]],
        "returned_decoys": [counter[0]],
        "return_perc":
        [num_unique_hits / (totals["actives"] + totals["decoys"])],
        "score_type": [rank_by],
        "EF0.5": [rs.EF(fraction=0.005)],
        "EF1": [rs.EF(fraction=0.01)],
        "EF2": [rs.EF(fraction=0.02)]
    })

    return enrichment_stats