def f3():
    """Plot paper acceptance against paper id, labelled as submission order.

    Reads the global ``df_paper`` frame and forwards it to ``plotBucket``
    with ``"paperId"`` and ``"accepted"`` as the two column arguments and
    ``numBuckets=10``.
    """
    # NOTE(review): the x axis is labelled "Submission Order" although the
    # column is paperId -- presumably ids follow submission order; confirm.
    plot_options = dict(
        x_label="Submission Order",
        y_label="P(Accept)",
        title="Acceptance vs. Submission Order",
        numBuckets=10,
    )
    plotBucket(df_paper, "paperId", "accepted", **plot_options)
Example #2
0
def f3():
    """Draw the "Acceptance vs. Submission Order" bucket plot.

    Passes the global ``df_paper`` to ``plotBucket`` together with the
    ``"paperId"`` and ``"accepted"`` column names, requesting 10 buckets.
    """
    papers = df_paper
    plotBucket(
        papers, "paperId", "accepted",
        numBuckets=10,
        title="Acceptance vs. Submission Order",
        x_label="Submission Order",
        y_label="P(Accept)",
    )
def f2():
    """Plot per-reviewer rating variance against date of first paper.

    Computes the standard deviation of ``rating`` per ``userId`` from the
    global ``df_review``, joins it to ``df_reviewer`` on ``userId``, squares
    it into a ``var`` column, and hands the result to ``plotBucket``.
    """
    # One row per userId holding the standard deviation of that user's ratings.
    rating_std = df_review.groupby("userId")["rating"].std().reset_index()
    reviewers = pd.merge(rating_std, df_reviewer, on="userId")

    # Squaring the standard deviation yields the variance.
    reviewers["var"] = reviewers["rating"]**2

    plotBucket(reviewers, "dateFirstPaper", "var",
               x_label="Date of First Paper",
               y_label="Variance of Ratings",
               x_percentile=False,
               title="Rating Variance vs. Reviewer Seniority",
               numBuckets=5)
Example #4
0
def f2():
    """Draw the "Rating Variance vs. Reviewer Seniority" bucket plot.

    The per-user standard deviation of ``rating`` (from the global
    ``df_review``) is merged with ``df_reviewer`` on ``userId``; its square
    is stored as ``var`` and plotted against ``dateFirstPaper``.
    """
    ratings_by_user = df_review.groupby("userId")["rating"]
    spread = ratings_by_user.std().reset_index()
    merged = pd.merge(spread, df_reviewer, on="userId")

    # std ** 2 == variance
    merged["var"] = merged["rating"] ** 2

    plot_options = {
        "x_label": "Date of First Paper",
        "y_label": "Variance of Ratings",
        "x_percentile": False,
        "title": "Rating Variance vs. Reviewer Seniority",
        "numBuckets": 5,
    }
    plotBucket(merged, "dateFirstPaper", "var", **plot_options)
def f1():
    """Plot mean review length against number of bids, then a bid histogram.

    Averages ``reviewLength`` per ``userId`` from the global ``df_review``,
    joins the result to ``df_reviewer`` on ``userId``, and passes the merged
    frame to ``plotBucket`` and ``plotFrequencyHistogram``.
    """
    mean_length = (
        df_review.groupby("userId")["reviewLength"].mean().reset_index()
    )
    reviewers = pd.merge(mean_length, df_reviewer, on="userId")

    plotBucket(
        reviewers,
        "numBids",
        "reviewLength",
        x_label="Number of Bids",
        y_label="Average Review Length",
        x_percentile=False,
        title="Review Quality vs. Number of Bids",
        numBuckets=7,
        xlim=[0, 100],
    )

    # Bin edges 0, 5, 10, ..., 50.
    bid_bins = list(range(0, 51, 5))
    plotFrequencyHistogram(
        reviewers,
        'numBids',
        'Number of Bids',
        myBins=bid_bins,
    )
Example #6
0
def f1():
    """Draw the review-length-vs-bids bucket plot and the bid histogram.

    The per-user mean of ``reviewLength`` (from the global ``df_review``) is
    merged with ``df_reviewer`` on ``userId``; the merged frame feeds both
    plotting helpers.
    """
    per_user = df_review.groupby("userId")["reviewLength"]
    merged = pd.merge(per_user.mean().reset_index(), df_reviewer, on="userId")

    bucket_options = {
        "x_label": "Number of Bids",
        "y_label": "Average Review Length",
        "x_percentile": False,
        "title": "Review Quality vs. Number of Bids",
        "numBuckets": 7,
        "xlim": [0, 100],
    }
    plotBucket(merged, "numBids", "reviewLength", **bucket_options)

    plotFrequencyHistogram(merged, 'numBids', 'Number of Bids',
                           myBins=[0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50])
Example #7
0
# `warnings` is imported explicitly: the block below uses
# warnings.catch_warnings(), and relying on `from pylab import *` to leak the
# name is not dependable across matplotlib versions.
import warnings

# The star import supplies setp() and legend() used further down.
from pylab import *
with warnings.catch_warnings():
    # Silence any warnings emitted while pandas is being imported.
    warnings.simplefilter("ignore")
    import pandas as pd
from utilities.plotBucket import plotBucket
from utilities.plotBucket import plotBar

df_rating = pd.read_pickle(
    "savedFrames/ratingPrediction/ratingTable")
df_paper = pd.read_pickle(
    "savedFrames/summaryStatistics/papersTable")

# First series: rating against "authorsMaxPastPaper" (legend entry
# "Most Experienced Author").
p1 = plotBucket(
    df_rating,
    "authorsMaxPastPaper",
    "rating",
    color="blue",
    x_label="Number of Past Papers",
    y_label="Average Rating")
setp(p1, linewidth=1.5)

# Second series: rating against "primaryAuthorPastPaperCount" (legend entry
# "Primary Author"); sameFigure=True is passed for this call.
p2 = plotBucket(
    df_rating,
    "primaryAuthorPastPaperCount",
    "rating",
    color="red",
    sameFigure=True,
    marker="s")
setp(p2, linewidth=1.5)
legend([p1, p2], ["Most Experienced Author", "Primary Author"], loc=4)
# Join the most-similar table onto df by paperId.
df = pd.merge(df, df_mostSimilar, on=['paperId'])

# Then join a fixed subset of papersFrame columns, again by paperId.
df = pd.merge(
    df,
    papersFrame[[
        "paperId",
        "maxConnectivity",
        "maxPageRank",
        "maxEigenCenter",
        "maxDegCenter",
        "primarySpecificSubjectArea",
        "primarySubjectArea",
    ]],
    on=["paperId"],
)

# Average Rating vs. Number of Past Papers
# Series 1 (legend entry "Most Experienced Author").
p1 = plotBucket(df_rating, "authorsMaxPastPaper", "rating",
                delta=10,
                color="blue",
                x_label="Number of Past Papers",
                y_label="Average Rating")
setp(p1, linewidth=1.5)

# Series 2 (legend entry "Primary Author"); sameFigure=True is passed here.
p2 = plotBucket(df_rating, "primaryAuthorPastPaperCount", "rating",
                delta=10,
                color="red",
                sameFigure=True,
                marker="s")
setp(p2, linewidth=1.5)

legend([p1, p2], ["Most Experienced Author", "Primary Author"], loc=4)
# Error-bar plot of meansAvg (with stdErrorsAvg as the y error) at
# x positions 1 through 7, on a white figure background.
fig = plt.figure()
fig.set_facecolor("white")
p = plt.errorbar(
    range(1, 8),
    meansAvg,
    yerr=stdErrorsAvg,
    marker="o",
    color="red",
)
plt.xlim([0, 8])
plt.ylabel('Avg Rating of Paper')
plt.xlabel('Avg Distance between Reviewer and Paper Authors')
fig.suptitle('Avg Rating Given Avg Distance')

# Same axis labels as the figure above, via the bucket helper.
plotBucket(
    reviewFrame,
    "avgDist",
    "rating",
    delta=10,
    y_label="Avg Rating of Paper",
    x_label="Avg Distance between Reviewer and Paper Authors",
    color="red",
    x_percentile=False,
)

# plt.figure(22)
# plt.scatter(reviewsWithHistory['numPastPapers'], reviewsWithHistory['rating'] + np.random.normal(loc=0, scale=0.2, size=2971),
#     c = ['red' if a else 'blue' for a in reviewsWithHistory['accepted']])

# Empty accumulators (four distinct lists) and the bin edges.
meansPaper, stdDevsPaper, numsPaper, stdErrorsPaper = [], [], [], []
bins = [0, 5, 10, 20, 40, 80, 160, 320, 640]
for i in range(len(bins) - 1):
    meansPaper.append(reviewsWithHistory[
# Load the saved iteration-5 tables.
df_paper = pd.read_pickle(
    "savedFrames/iteration5/paperTable")
df_review = pd.read_pickle(
    "savedFrames/iteration5/reviewTable")
df_industry = pd.read_pickle(
    "savedFrames/iteration5/industryReviewTable")
df_author = pd.read_pickle(
    "savedFrames/iteration5/authorTable")

#SECTION 1
#PAST PAPER COUNTS
df = df_paper  # [df_paper["paperCountPrimary"] > 0]
# True where the primary count differs from the max count, or where the max
# count equals the second-highest count.
primary_different = (
    (df["paperCountPrimary"] != df["paperCountMax"])
    | (df["paperCountMax"] == df["paperCount2ndHighest"])
)
df = df_paper
plotBucket(
    df,
    "paperCountAvg",
    "avgRating",
    numBuckets=5,
    color="green",
    x_label="Number of Past Papers",
    y_label="Average Rating",
    title="Rating vs. Average Past Paper Counts of Authors",
)
ylim([-1.8, 0])
p1 = plotBucket(
    df,
    "topPaperCountAvg",
    "avgRating",
    numBuckets=6,
    color="green",
    x_label="Number of Past Papers",
    y_label="Average Rating",
    title="Rating vs. Average Past Paper Counts of Authors",
)
p2 = plotBucket(df,
                "paperCountAvg",
                "avgRating",
Example #11
0
# Keep only users with at least one submission, which also keeps the
# denominator of the acceptance rate non-zero. An explicit copy is taken so
# the column assignment below writes to an independent frame instead of a
# slice of userFrame (assigning to such a slice triggers pandas'
# SettingWithCopyWarning).
authorFrame = userFrame[userFrame["#Papers"] > 0].copy()
# Multiplying by 1.0 forces floating-point division.
authorFrame["acceptanceRate"] = authorFrame["#Accepted"] \
    * 1.0/authorFrame["#Papers"]
plotBar(authorFrame,
        "#Papers",
        "acceptanceRate", [0, 1, 2, 3, 4, 5],
        title="Acceptance Rate vs. Number of Submissions",
        x_label="Number of Submissions",
        y_label="Acceptance Rate",
        xlim=[0, 6])

# Submissions against past papers, over all users (userFrame).
plotBucket(userFrame,
           "#PastPapers",
           "#Papers",
           x_label="Number of Past Papers",
           y_label="Number of Submissions",
           x_percentile=False,
           xlim=[0, 200])

# Acceptance rate against past papers, over the filtered authorFrame.
plotBucket(authorFrame,
           "#PastPapers",
           "acceptanceRate",
           x_label="Number of Past Papers",
           y_label="Acceptance Rate",
           x_percentile=False,
           color="Red",
           xlim=[0, 200])

plotBucket(papersFrame,
           "authorsMaxPastPaper",
    "savedFrames/iteration5/industryReviewTable")
df_author = pd.read_pickle("savedFrames/iteration5/authorTable")

#SECTION 1
#PAST PAPER COUNTS
df = df_paper  # [df_paper["paperCountPrimary"] > 0]
# Flags rows where the primary count is not the max count, or where the max
# count ties with the second-highest count.
primary_different = (
    (df["paperCountPrimary"] != df["paperCountMax"])
    | (df["paperCountMax"] == df["paperCount2ndHighest"])
)
df = df_paper
plotBucket(df,
           "paperCountAvg",
           "avgRating",
           numBuckets=5,
           color="green",
           x_label="Number of Past Papers",
           y_label="Average Rating",
           title="Rating vs. Average Past Paper Counts of Authors")
ylim([-1.8, 0])
p1 = plotBucket(df,
                "topPaperCountAvg",
                "avgRating",
                numBuckets=6,
                color="green",
                x_label="Number of Past Papers",
                y_label="Average Rating",
                title="Rating vs. Average Past Paper Counts of Authors")
p2 = plotBucket(
Example #13
0
def plotTest(df, col, col2):
    """Call ``plotBucket(df, col, col2)`` with no extra options.

    The result of ``plotBucket`` is discarded; this function returns None.
    """
    plotBucket(df, col, col2)
Example #14
0
    "paperId", "userId2", "rating2", "authorReviewerSimilarity2",
    "pastPaperSimilarity2", "maxAuthorReviewerSimilarity2", "minDist2"
]

# Join df with df2 on paperId.
df = pd.merge(df, df2, on='paperId')

# Keep rows where the two user ids differ and the two ratings differ.
df = df[
    (df["userId"] != df["userId2"])
    & (df["rating"] != df["rating2"])
]

# Differences between the first and second entry of each row, plus a flag
# for whether the first rating is the higher one.
df["simDiff"] = (
    df["authorReviewerSimilarity"] - df["authorReviewerSimilarity2"]
)
df["distDiff"] = df["minDist"] - df["minDist2"]
df["higherRating"] = df["rating"] > df["rating2"]

plotBucket(
    df,
    "simDiff",
    "higherRating",
    x_label="Difference in Author/Reviewer Similarity",
    y_label="Probability of a Higher Rating",
    delta=10,
    x_percentile=False,
    xlim=[-.1, .1],
)

df_dist = df[df["distDiff"] >= 0]
# NOTE(review): df_dist is computed above, but the call below plots df --
# confirm whether the filtered frame was meant to be passed here.
plotBucket(
    df,
    "distDiff",
    "higherRating",
    delta=5,
    x_label="Difference in Min Distance to Reviewer",
    y_label="Probability of a Higher Rating",
    color="Green",
    x_percentile=False,
)

plotBar(df_original,
Example #15
0
import warnings
from pylab import *
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    import pandas as pd
from utilities.plotBucket import plotBucket
from utilities.plotBucket import plotBar

# Load the rating and paper tables.
df_rating = pd.read_pickle(
    "savedFrames/ratingPrediction/ratingTable")
df_paper = pd.read_pickle(
    "savedFrames/summaryStatistics/papersTable")

# Series 1 (legend entry "Most Experienced Author").
p1 = plotBucket(
    df_rating,
    "authorsMaxPastPaper",
    "rating",
    color="blue",
    x_label="Number of Past Papers",
    y_label="Average Rating",
)
setp(p1, linewidth=1.5)

# Series 2 (legend entry "Primary Author"); sameFigure=True is passed here.
p2 = plotBucket(
    df_rating,
    "primaryAuthorPastPaperCount",
    "rating",
    color="red",
    sameFigure=True,
    marker="s",
)
setp(p2, linewidth=1.5)
legend([p1, p2], ["Most Experienced Author", "Primary Author"], loc=4)

p1 = plotBucket(df_paper,
                "authorsMaxPastPaper",
                "accepted",
Example #16
0
    "minDist2",
]

# Join df with df2 on paperId.
df = pd.merge(df, df2, on="paperId")

# Keep rows where the two user ids differ and the two ratings differ.
df = df[(df["userId"] != df["userId2"])
        & (df["rating"] != df["rating2"])]

# Per-row differences and a flag for the first rating being the higher one.
df["simDiff"] = (df["authorReviewerSimilarity"]
                 - df["authorReviewerSimilarity2"])
df["distDiff"] = df["minDist"] - df["minDist2"]
df["higherRating"] = df["rating"] > df["rating2"]

plotBucket(df,
           "simDiff",
           "higherRating",
           x_label="Difference in Author/Reviewer Similarity",
           y_label="Probability of a Higher Rating",
           delta=10,
           x_percentile=False,
           xlim=[-0.1, 0.1])

# Rows whose distDiff is non-negative.
df_dist = df[df["distDiff"] >= 0]
plotBucket(
    df,
    "distDiff",
    "higherRating",
    delta=5,
    x_label="Difference in Min Distance to Reviewer",
    y_label="Probability of a Higher Rating",
    color="Green",
    x_percentile=False,
# Load the saved tables.
df_paper = pd.read_pickle("savedFrames/summaryStatistics/papersTable")
df_dist = pd.read_pickle("savedFrames/reviewStatistics/reviewTable")
userFrame = pd.read_pickle("savedFrames/summaryStatistics/userTable")
df_mostSimilar = pd.read_pickle(
    "savedFrames/ratingPrediction/mostSimilarTable")

# Join ratings to the review table on (paperId, userId, rating), then add
# the most-similar table by paperId.
df_merged = pd.merge(
    df_rating,
    df_dist,
    on=['paperId', 'userId', 'rating'],
)
df_merged = pd.merge(
    df_merged,
    df_mostSimilar,
    on=['paperId'],
)

#Plot past paper count vs. average rating
p1 = plotBucket(df_rating,
                "authorsMaxPastPaper",
                "rating",
                color="blue",
                x_label="Number of Past Papers",
                y_label="Average Rating")
setp(p1, linewidth=1.5)

# Second series; sameFigure=True is passed for this call.
p2 = plotBucket(df_rating,
                "primaryAuthorPastPaperCount",
                "rating",
                color="red",
                sameFigure=True,
                marker="s")
setp(p2, linewidth=1.5)
legend([p1, p2], ["Most Experienced Author", "Primary Author"], loc=4)
Example #18
0
# Column name passed to plotBucket as the second argument further down.
maxSimCol = "maxAuthorReviewerSimilarity"


def plotTest(df, col, col2):
    """Forward ``df``, ``col`` and ``col2`` to ``plotBucket`` unchanged.

    Nothing is returned; the value produced by ``plotBucket`` is dropped.
    """
    bucket_args = (df, col, col2)
    plotBucket(*bucket_args)

# Rating against the maxSimCol column, drawn with green square markers.
p = plotBucket(
    df, maxSimCol, "rating",
    title=("Average Rating vs. Similarity of Reviewer to "
           "Most Experienced Author"),
    x_label="Similarity",
    y_label="Rating",
    delta=15,
    marker="s",
    color="green",
    xlim=[0, 100],
)
setp(p, linewidth=2, alpha=1)

p = plotBucket(
    df,
    "authorReviewerSimilarity",
    "rating",
    x_label="Similarity",
    y_label="Rating",
    delta=15,
Example #19
0
             "  Review Submission\n  Deadline",
             va="top",
             color="gray",
             alpha=.8)


# Month/day labels and their transformDates() conversion.
labels_month = ['3/20', '3/30', '4/10', '4/20', '4/30']
dates_month = transformDates(labels_month)

#Plot 3 -- Submission Time vs. Rating
# First call uses df and passes plotMean=False.
plotBucket(df,
           'time',
           'rating',
           numBuckets=5,
           x_percentile=False,
           title='Rating v. Submission Date',
           x_label="Submission Date",
           y_label="Average Review Rating",
           plotMean=False)
# Second call uses df_more, in red, without the plotMean argument.
plotBucket(df_more,
           'time',
           'rating',
           numBuckets=5,
           x_percentile=False,
           title='Rating v. Submission Date',
           x_label="Submission Date",
           y_label="Average Review Rating",
           color="red")
    stdErrorsAvg.append(stdDevsAvg[-1] / math.sqrt(numsAvg[-1]))

# Error-bar plot of meansAvg with stdErrorsAvg as the y error, at
# x positions 1 through 7.
fig = plt.figure()
fig.set_facecolor("white")
p = plt.errorbar(range(1, 8),
                 meansAvg,
                 marker="o",
                 color="red",
                 yerr=stdErrorsAvg)
plt.xlim([0, 8])
plt.ylabel("Avg Rating of Paper")
plt.xlabel("Avg Distance between Reviewer and Paper Authors")
fig.suptitle("Avg Rating Given Avg Distance")

# Same axis labels as the figure above, via the bucket helper.
plotBucket(reviewFrame,
           "avgDist",
           "rating",
           delta=10,
           y_label="Avg Rating of Paper",
           x_label="Avg Distance between Reviewer and Paper Authors",
           color="red",
           x_percentile=False)


# plt.figure(22)
# plt.scatter(reviewsWithHistory['numPastPapers'], reviewsWithHistory['rating'] + np.random.normal(loc=0, scale=0.2, size=2971),
#     c = ['red' if a else 'blue' for a in reviewsWithHistory['accepted']])

# Empty accumulators (each its own list) and the bin edges.
meansPaper = list()
stdDevsPaper = list()
numsPaper = list()
stdErrorsPaper = list()
bins = [0, 5, 10, 20, 40, 80, 160, 320, 640]
Example #21
0
# Acceptance rate per "#Papers" value; the list is passed as the fourth
# positional argument of plotBar.
plotBar(authorFrame,
        "#Papers",
        "acceptanceRate",
        [0, 1, 2, 3, 4, 5],
        title="Acceptance Rate vs. Number of Submissions",
        x_label="Number of Submissions",
        y_label="Acceptance Rate",
        xlim=[0, 6])


# Submissions against past papers, over userFrame.
plotBucket(userFrame,
           "#PastPapers",
           "#Papers",
           x_label="Number of Past Papers",
           y_label="Number of Submissions",
           x_percentile=False,
           xlim=[0, 200])


# Acceptance rate against past papers, over authorFrame.
plotBucket(authorFrame,
           "#PastPapers",
           "acceptanceRate",
           x_label="Number of Past Papers",
           y_label="Acceptance Rate",
           x_percentile=False,
           color="Red",
           xlim=[0, 200])