def f3():
    """Plot paper acceptance against submission order.

    Hands the module-level ``df_paper`` frame to ``plotBucket`` together with
    the "paperId" and "accepted" column names and ``numBuckets=10``. The axes
    are labelled "Submission Order" and "P(Accept)". Nothing is returned.
    """
    # presumably paperId grows with submission time, which is why it stands in
    # for submission order here -- verify against how the ids were assigned
    plotBucket(
        df_paper,
        "paperId",
        "accepted",
        x_label="Submission Order",
        y_label="P(Accept)",
        title="Acceptance vs. Submission Order",
        numBuckets=10,
    )
def f3():
    """Draw the "Acceptance vs. Submission Order" figure.

    Calls ``plotBucket`` on the module-level ``df_paper`` frame with the
    "paperId" and "accepted" column names and ``numBuckets=10``. The result of
    ``plotBucket`` is discarded, so this function returns ``None``.
    """
    # Text decorations for the figure, kept in the original keyword order.
    labels = {
        "x_label": "Submission Order",
        "y_label": "P(Accept)",
        "title": "Acceptance vs. Submission Order",
    }
    plotBucket(df_paper, "paperId", "accepted", **labels, numBuckets=10)
def f2():
    """Plot each reviewer's rating variance against the date of their first paper.

    The standard deviation of "rating" is computed per "userId" from the
    module-level ``df_review`` frame and joined onto ``df_reviewer`` by
    "userId". The result is squared into a "var" column and handed to
    ``plotBucket`` with "dateFirstPaper" and ``numBuckets=5``. Returns nothing.
    """
    rating_spread = df_review.groupby("userId")["rating"].std().reset_index()
    reviewers = pd.merge(rating_spread, df_reviewer, on="userId")

    # "rating" holds the per-reviewer standard deviation at this point, so
    # squaring it yields the variance.
    reviewers["var"] = reviewers["rating"] ** 2

    plotBucket(
        reviewers,
        "dateFirstPaper",
        "var",
        x_label="Date of First Paper",
        y_label="Variance of Ratings",
        x_percentile=False,
        title="Rating Variance vs. Reviewer Seniority",
        numBuckets=5,
    )
def f2():
    """Draw the "Rating Variance vs. Reviewer Seniority" figure.

    Builds a per-reviewer frame from the module-level ``df_review`` and
    ``df_reviewer`` frames: the standard deviation of "rating" per "userId",
    merged on "userId". A "var" column (that deviation squared) is added, then
    "dateFirstPaper" and "var" are passed to ``plotBucket`` with
    ``numBuckets=5``. The function returns ``None``.
    """
    per_reviewer_std = df_review.groupby("userId")["rating"].std().reset_index()
    merged = pd.merge(per_reviewer_std, df_reviewer, on="userId")
    # std ** 2 == variance; the column is still called "rating" after the merge.
    merged["var"] = merged["rating"] ** 2

    # Keyword arguments in their original order.
    plot_options = {
        "x_label": "Date of First Paper",
        "y_label": "Variance of Ratings",
        "x_percentile": False,
        "title": "Rating Variance vs. Reviewer Seniority",
        "numBuckets": 5,
    }
    plotBucket(merged, "dateFirstPaper", "var", **plot_options)
def f1():
    """Plot average review length against number of bids, plus a bid histogram.

    The mean "reviewLength" per "userId" is taken from the module-level
    ``df_review`` frame and merged with ``df_reviewer`` on "userId". That frame
    goes to ``plotBucket`` ("numBids" and "reviewLength", ``numBuckets=7``,
    ``xlim=[0, 100]``) and then to ``plotFrequencyHistogram`` for "numBids".
    Returns nothing.
    """
    mean_length = df_review.groupby("userId")["reviewLength"].mean().reset_index()
    bidders = pd.merge(mean_length, df_reviewer, on="userId")

    plotBucket(
        bidders,
        "numBids",
        "reviewLength",
        x_label="Number of Bids",
        y_label="Average Review Length",
        x_percentile=False,
        title="Review Quality vs. Number of Bids",
        numBuckets=7,
        xlim=[0, 100],
    )

    # Bin edges 0, 5, 10, ..., 50 (inclusive of 50).
    bid_bins = list(range(0, 51, 5))
    plotFrequencyHistogram(bidders, 'numBids', 'Number of Bids', myBins=bid_bins)
def f1():
    """Draw the "Review Quality vs. Number of Bids" figure and a bid histogram.

    Joins the per-"userId" mean of "reviewLength" (from the module-level
    ``df_review``) onto ``df_reviewer`` by "userId". The joined frame is passed
    first to ``plotBucket`` and then to ``plotFrequencyHistogram``. Both return
    values are discarded, so the function returns ``None``.
    """
    avg_review_length = (
        df_review.groupby("userId")["reviewLength"].mean().reset_index()
    )
    per_reviewer = pd.merge(avg_review_length, df_reviewer, on="userId")

    # Keyword arguments in their original order.
    bucket_options = {
        "x_label": "Number of Bids",
        "y_label": "Average Review Length",
        "x_percentile": False,
        "title": "Review Quality vs. Number of Bids",
        "numBuckets": 7,
        "xlim": [0, 100],
    }
    plotBucket(per_reviewer, "numBids", "reviewLength", **bucket_options)

    plotFrequencyHistogram(
        per_reviewer,
        'numBids',
        'Number of Bids',
        myBins=[0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50],
    )
# Script section: load the saved rating and paper frames, then plot average
# rating against past-paper counts for two author measures on one figure.
from pylab import *

# NOTE(review): the original indentation of this snippet was lost. It is
# assumed that only the pandas import sits inside the warnings block -- confirm
# against the real file. `warnings` itself must be imported above this point.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    import pandas as pd
from utilities.plotBucket import plotBucket
from utilities.plotBucket import plotBar

# Saved frames produced by earlier steps; read_pickle should only ever be
# pointed at trusted files.
df_rating = pd.read_pickle( "savedFrames/ratingPrediction/ratingTable")
df_paper = pd.read_pickle( "savedFrames/summaryStatistics/papersTable")

# First series: "authorsMaxPastPaper" against "rating", drawn in blue.
p1 = plotBucket( df_rating, "authorsMaxPastPaper", "rating", color="blue", x_label="Number of Past Papers", y_label="Average Rating")
# setp/legend presumably come from the pylab star import above.
setp(p1, linewidth=1.5)

# Second series: "primaryAuthorPastPaperCount" against "rating", drawn in red
# with square markers; sameFigure=True presumably overlays it on the first
# plot -- confirm in plotBucket.
p2 = plotBucket( df_rating, "primaryAuthorPastPaperCount", "rating", color="red", sameFigure=True, marker="s")
setp(p2, linewidth=1.5)

# loc=4 is matplotlib's numeric code for the lower-right corner.
legend([p1, p2], ["Most Experienced Author", "Primary Author"], loc=4)
# Script section: enrich the working frame `df` with most-similar-paper data
# and selected paper-level columns, then plot average rating against
# past-paper counts for two author measures.

# Columns of papersFrame joined onto the working frame ("paperId" is the key).
paper_columns = [
    "paperId",
    "maxConnectivity",
    "maxPageRank",
    "maxEigenCenter",
    "maxDegCenter",
    "primarySpecificSubjectArea",
    "primarySubjectArea",
]

df = pd.merge(df, df_mostSimilar, on=['paperId'])
df = pd.merge(df, papersFrame[paper_columns], on=["paperId"])

# Average Rating vs. Number of Past Papers
# Note that both series are drawn from df_rating, not from the merged `df`.
p1 = plotBucket(
    df_rating,
    "authorsMaxPastPaper",
    "rating",
    delta=10,
    color="blue",
    x_label="Number of Past Papers",
    y_label="Average Rating",
)
setp(p1, linewidth=1.5)

# Second series in red with square markers; sameFigure=True presumably
# overlays it on the figure created above -- confirm in plotBucket.
p2 = plotBucket(
    df_rating,
    "primaryAuthorPastPaperCount",
    "rating",
    delta=10,
    color="red",
    sameFigure=True,
    marker="s",
)
setp(p2, linewidth=1.5)

# loc=4 is matplotlib's numeric code for the lower-right corner.
legend([p1, p2], ["Most Experienced Author", "Primary Author"], loc=4)
fig = plt.figure() fig.set_facecolor("white") p = plt.errorbar(range(1, 8), meansAvg, marker="o", color="red", yerr=stdErrorsAvg) plt.xlim([0, 8]) plt.ylabel('Avg Rating of Paper') plt.xlabel('Avg Distance between Reviewer and Paper Authors') fig.suptitle('Avg Rating Given Avg Distance') plotBucket(reviewFrame, "avgDist", "rating", delta=10, y_label="Avg Rating of Paper", x_label="Avg Distance between Reviewer and Paper Authors", color="red", x_percentile=False) # plt.figure(22) # plt.scatter(reviewsWithHistory['numPastPapers'], reviewsWithHistory['rating'] + np.random.normal(loc=0, scale=0.2, size=2971), # c = ['red' if a else 'blue' for a in reviewsWithHistory['accepted']]) meansPaper = [] stdDevsPaper = [] numsPaper = [] stdErrorsPaper = [] bins = [0, 5, 10, 20, 40, 80, 160, 320, 640] for i in range(len(bins) - 1): meansPaper.append(reviewsWithHistory[
df_paper = pd.read_pickle("savedFrames/iteration5/paperTable") df_review = pd.read_pickle("savedFrames/iteration5/reviewTable") df_industry = pd.read_pickle("savedFrames/iteration5/industryReviewTable") df_author = pd.read_pickle("savedFrames/iteration5/authorTable") #SECTION 1 #PAST PAPER COUNTS df = df_paper # [df_paper["paperCountPrimary"] > 0] primary_different =\ (df["paperCountPrimary"] != df["paperCountMax"])\ | (df["paperCountMax"] == df["paperCount2ndHighest"]) df = df_paper plotBucket(df, "paperCountAvg", "avgRating", numBuckets=5, color="green", x_label="Number of Past Papers", y_label="Average Rating", title="Rating vs. Average Past Paper Counts of Authors") ylim([-1.8, 0]) p1 = plotBucket(df, "topPaperCountAvg", "avgRating", numBuckets=6, color="green", x_label="Number of Past Papers", y_label="Average Rating", title="Rating vs. Average Past Paper Counts of Authors") p2 = plotBucket(df, "paperCountAvg", "avgRating",
authorFrame = userFrame[userFrame["#Papers"] > 0] authorFrame["acceptanceRate"] = authorFrame["#Accepted"] \ * 1.0/authorFrame["#Papers"] plotBar(authorFrame, "#Papers", "acceptanceRate", [0, 1, 2, 3, 4, 5], title="Acceptance Rate vs. Number of Submissions", x_label="Number of Submissions", y_label="Acceptance Rate", xlim=[0, 6]) plotBucket(userFrame, "#PastPapers", "#Papers", x_label="Number of Past Papers", y_label="Number of Submissions", x_percentile=False, xlim=[0, 200]) plotBucket(authorFrame, "#PastPapers", "acceptanceRate", x_label="Number of Past Papers", y_label="Acceptance Rate", x_percentile=False, color="Red", xlim=[0, 200]) plotBucket(papersFrame, "authorsMaxPastPaper",
"savedFrames/iteration5/industryReviewTable") df_author = pd.read_pickle( "savedFrames/iteration5/authorTable") #SECTION 1 #PAST PAPER COUNTS df = df_paper # [df_paper["paperCountPrimary"] > 0] primary_different =\ (df["paperCountPrimary"] != df["paperCountMax"])\ | (df["paperCountMax"] == df["paperCount2ndHighest"]) df = df_paper plotBucket( df, "paperCountAvg", "avgRating", numBuckets=5, color="green", x_label="Number of Past Papers", y_label="Average Rating", title="Rating vs. Average Past Paper Counts of Authors") ylim([-1.8, 0]) p1 = plotBucket( df, "topPaperCountAvg", "avgRating", numBuckets=6, color="green", x_label="Number of Past Papers", y_label="Average Rating", title="Rating vs. Average Past Paper Counts of Authors") p2 = plotBucket(
def plotTest(df, col, col2):
    """Forward ``df`` and the two column names to ``plotBucket`` unchanged.

    No keyword arguments are supplied, so ``plotBucket`` runs with its own
    defaults. Its result is not propagated; this function returns ``None``.
    """
    plotBucket(df, col, col2)
"paperId", "userId2", "rating2", "authorReviewerSimilarity2", "pastPaperSimilarity2", "maxAuthorReviewerSimilarity2", "minDist2" ] df = pd.merge(df, df2, on='paperId') df = df[(df["userId"] != df["userId2"]) & (df["rating"] != df["rating2"])] df["simDiff"] = df["authorReviewerSimilarity"] - df["authorReviewerSimilarity2"] df["distDiff"] = df["minDist"] - df["minDist2"] df["higherRating"] = df["rating"] > df["rating2"] plotBucket(df, "simDiff", "higherRating", x_label="Difference in Author/Reviewer Similarity", y_label="Probability of a Higher Rating", delta=10, x_percentile=False, xlim=[-.1, .1]) df_dist = df[df["distDiff"] >= 0] plotBucket(df, "distDiff", "higherRating", delta=5, x_label="Difference in Min Distance to Reviewer", y_label="Probability of a Higher Rating", color="Green", x_percentile=False) plotBar(df_original,
import warnings from pylab import * with warnings.catch_warnings(): warnings.simplefilter("ignore") import pandas as pd from utilities.plotBucket import plotBucket from utilities.plotBucket import plotBar df_rating = pd.read_pickle("savedFrames/ratingPrediction/ratingTable") df_paper = pd.read_pickle("savedFrames/summaryStatistics/papersTable") p1 = plotBucket(df_rating, "authorsMaxPastPaper", "rating", color="blue", x_label="Number of Past Papers", y_label="Average Rating") setp(p1, linewidth=1.5) p2 = plotBucket(df_rating, "primaryAuthorPastPaperCount", "rating", color="red", sameFigure=True, marker="s") setp(p2, linewidth=1.5) legend([p1, p2], ["Most Experienced Author", "Primary Author"], loc=4) p1 = plotBucket(df_paper, "authorsMaxPastPaper", "accepted",
"minDist2", ] df = pd.merge(df, df2, on="paperId") df = df[(df["userId"] != df["userId2"]) & (df["rating"] != df["rating2"])] df["simDiff"] = df["authorReviewerSimilarity"] - df["authorReviewerSimilarity2"] df["distDiff"] = df["minDist"] - df["minDist2"] df["higherRating"] = df["rating"] > df["rating2"] plotBucket( df, "simDiff", "higherRating", x_label="Difference in Author/Reviewer Similarity", y_label="Probability of a Higher Rating", delta=10, x_percentile=False, xlim=[-0.1, 0.1], ) df_dist = df[df["distDiff"] >= 0] plotBucket( df, "distDiff", "higherRating", delta=5, x_label="Difference in Min Distance to Reviewer", y_label="Probability of a Higher Rating", color="Green", x_percentile=False,
# Script section: load the saved frames, join the rating, review and
# most-similar tables, then plot average rating against past-paper counts.

# NOTE: read_pickle can execute arbitrary code while loading; only point it at
# trusted, locally generated files.
df_paper = pd.read_pickle("savedFrames/summaryStatistics/papersTable")
df_dist = pd.read_pickle("savedFrames/reviewStatistics/reviewTable")
userFrame = pd.read_pickle("savedFrames/summaryStatistics/userTable")
df_mostSimilar = pd.read_pickle("savedFrames/ratingPrediction/mostSimilarTable")

# A review row is matched on paper, reviewer and rating together.
review_keys = ['paperId', 'userId', 'rating']
df_merged = pd.merge(df_rating, df_dist, on=review_keys)
df_merged = pd.merge(df_merged, df_mostSimilar, on=['paperId'])

# Plot past paper count vs. average rating
# Both series are drawn from df_rating rather than from df_merged.
p1 = plotBucket(
    df_rating,
    "authorsMaxPastPaper",
    "rating",
    color="blue",
    x_label="Number of Past Papers",
    y_label="Average Rating",
)
setp(p1, linewidth=1.5)

# Second series in red with square markers; sameFigure=True presumably
# overlays it on the figure created above -- confirm in plotBucket.
p2 = plotBucket(
    df_rating,
    "primaryAuthorPastPaperCount",
    "rating",
    color="red",
    sameFigure=True,
    marker="s",
)
setp(p2, linewidth=1.5)

# loc=4 is matplotlib's numeric code for the lower-right corner.
legend([p1, p2], ["Most Experienced Author", "Primary Author"], loc=4)
maxSimCol = "maxAuthorReviewerSimilarity" def plotTest(df, col, col2): plotBucket( df, col, col2) p = plotBucket( df, maxSimCol, "rating", x_label="Similarity", y_label="Rating", delta=15, title= "Average Rating vs. Similarity of Reviewer to Most Experienced Author", marker="s", color="green", xlim=[0, 100] ) setp(p, linewidth=2, alpha=1) p = plotBucket( df, "authorReviewerSimilarity", "rating", x_label="Similarity", y_label="Rating", delta=15,
" Review Submission\n Deadline", va="top", color="gray", alpha=.8) labels_month = ['3/20', '3/30', '4/10', '4/20', '4/30'] dates_month = transformDates(labels_month) #Plot 3 -- Submission Time vs. Rating plotBucket( df, 'time', 'rating', numBuckets=5, x_percentile=False, title='Rating v. Submission Date', x_label="Submission Date", y_label="Average Review Rating", plotMean=False, ) plotBucket( df_more, 'time', 'rating', numBuckets=5, x_percentile=False, title='Rating v. Submission Date', x_label="Submission Date", y_label="Average Review Rating", color="red",
) stdErrorsAvg.append(stdDevsAvg[-1] / math.sqrt(numsAvg[-1])) fig = plt.figure() fig.set_facecolor("white") p = plt.errorbar(range(1, 8), meansAvg, marker="o", color="red", yerr=stdErrorsAvg) plt.xlim([0, 8]) plt.ylabel("Avg Rating of Paper") plt.xlabel("Avg Distance between Reviewer and Paper Authors") fig.suptitle("Avg Rating Given Avg Distance") plotBucket( reviewFrame, "avgDist", "rating", delta=10, y_label="Avg Rating of Paper", x_label="Avg Distance between Reviewer and Paper Authors", color="red", x_percentile=False, ) # plt.figure(22) # plt.scatter(reviewsWithHistory['numPastPapers'], reviewsWithHistory['rating'] + np.random.normal(loc=0, scale=0.2, size=2971), # c = ['red' if a else 'blue' for a in reviewsWithHistory['accepted']]) meansPaper = [] stdDevsPaper = [] numsPaper = [] stdErrorsPaper = [] bins = [0, 5, 10, 20, 40, 80, 160, 320, 640]
# Script section: three figures relating submission counts, past-paper counts
# and acceptance rate, drawn from authorFrame and userFrame.

# Acceptance rate per number of submissions, at the values 0 through 5.
plotBar(
    authorFrame,
    "#Papers",
    "acceptanceRate",
    list(range(6)),
    title="Acceptance Rate vs. Number of Submissions",
    x_label="Number of Submissions",
    y_label="Acceptance Rate",
    xlim=[0, 6],
)

# Number of submissions against past papers, for all users.
plotBucket(
    userFrame,
    "#PastPapers",
    "#Papers",
    x_label="Number of Past Papers",
    y_label="Number of Submissions",
    x_percentile=False,
    xlim=[0, 200],
)

# Acceptance rate against past papers, for the author frame only.
plotBucket(
    authorFrame,
    "#PastPapers",
    "acceptanceRate",
    x_label="Number of Past Papers",
    y_label="Acceptance Rate",
    x_percentile=False,
    color="Red",
    xlim=[0, 200],
)