def getWholePost(db, postId):
    """Return a ``(title, wholePost)`` tuple for the question ``postId``.

    ``wholePost`` is the question title, the question body, the bodies of
    all answers returned by ``util.iterateAnswers`` and the question tags,
    joined with blank lines.  Both elements are ``None`` when
    ``util.iterateQuestions`` yields nothing for ``postId``.
    """
    result = (None, None)
    for post in util.iterateQuestions(db, postList=[postId]):
        # Collect the answer bodies first; they form one section of the text.
        answerBodies = []
        for reply in util.iterateAnswers(db, postId):
            answerBodies.append(reply.body)
        answerText = "\n\n".join(answerBodies)

        sections = (post.title, post.body, answerText, post.tags)
        # If several questions are yielded, the last one wins.
        result = (post.title, "\n\n".join(sections))
    return result
Example #2
0
def getWholePost(db, postId):
    """Build the full text of a question and return it with its title.

    Returns a ``(title, wholePost)`` tuple.  ``wholePost`` concatenates, with
    blank lines in between: the title, the question body, every answer body
    yielded by ``util.iterateAnswers`` and the question tags.  When
    ``util.iterateQuestions`` yields no question the result is
    ``(None, None)``.
    """
    separator = "\n\n"
    foundTitle, fullText = None, None
    for post in util.iterateQuestions(db, postList=[postId]):
        replyBodies = [reply.body for reply in util.iterateAnswers(db, postId)]
        allAnswers = separator.join(replyBodies)
        foundTitle = post.title
        # Same section order as always: title, body, answers, tags.
        fullText = separator.join(
            [post.title, post.body, allAnswers, post.tags])
    return (foundTitle, fullText)
 def __iter__(self):
     """Yield one ``self.dictionary.doc2bow`` result per question.

     Questions come from ``util.iterateQuestions`` for this object's db,
     topic and postList.  Each question is tokenized together with the
     bodies of its answers.  As a side effect ``self.corpusToPost`` maps
     the running counter ``self.ctr`` to the question id, and ``self.ctr``
     is incremented once per question.
     """
     for post in util.iterateQuestions(self.db, self.topic, self.postList):
         replies = list(util.iterateAnswers(self.db, [post.id]))
         tokens = tokenizePost(
             post.title,
             post.body,
             [reply.body for reply in replies],
             post.tags,
         )

         # Progress report on stderr every 5000 posts, debug mode only.
         if Config.debug and self.ctr > 0 and self.ctr % 5000 == 0:
             now = time.time()
             # First figure: time since the previous report (self.t0);
             # second figure: overall rate since self.tbegin.
             print >>sys.stderr, "Posts imported:", self.ctr, "(in %0.1fs, %0.2fpost/s)" % (
                 now - self.t0, self.ctr / (now - self.tbegin))
             self.t0 = now

         # Remember which question sits at this position of the corpus.
         position = self.ctr
         self.corpusToPost[position] = post.id
         self.ctr = position + 1

         unicodeTokens = list(self.unicodifyTokens(tokens))
         yield self.dictionary.doc2bow(unicodeTokens, allow_update=True)
Example #4
0
    def __iter__(self):
        """Iterate the corpus, yielding a ``doc2bow`` vector per question.

        For every question from ``util.iterateQuestions`` the title, body,
        answer bodies and tags are tokenized, the mapping
        ``self.corpusToPost[self.ctr] -> question.id`` is recorded, the
        counter ``self.ctr`` is advanced, and the unicodified tokens are
        passed to ``self.dictionary.doc2bow`` with ``allow_update=True``.
        """
        questions = util.iterateQuestions(self.db, self.topic, self.postList)
        for post in questions:
            replies = list(util.iterateAnswers(self.db, [post.id]))
            replyBodies = [reply.body for reply in replies]
            tokens = tokenizePost(post.title, post.body, replyBodies, post.tags)

            # In debug mode, report progress to stderr every 5000 posts.
            if Config.debug and self.ctr > 0 and self.ctr % 5000 == 0:
                now = time.time()
                # Elapsed time is measured from the last report (self.t0),
                # the rate from the very beginning (self.tbegin).
                print >> sys.stderr, "Posts imported:", self.ctr, "(in %0.1fs, %0.2fpost/s)" % (
                    now - self.t0, self.ctr / (now - self.tbegin))
                self.t0 = now

            # Record the corpus position -> question id mapping.
            self.corpusToPost[self.ctr] = post.id
            self.ctr += 1

            yield self.dictionary.doc2bow(
                list(self.unicodifyTokens(tokens)), allow_update=True)
Example #5
0
def scoreUsers(db, query, queryResults, topicModel, cutoffPercentile=75, resultCutoff=0.5):
    """Rank the users who answered the questions in ``queryResults``.

    Each answer to one of the result questions gets a post score: the
    question's similarity multiplied by ``(1 + percentile)`` factors for
    the answer's sentiment-adjusted score, favorites and views, and by
    ``(1 + acceptedBonus * accepted)``.  A user's score is the sum of the
    post scores of that user's answers.

    Parameters:
        db: database handle, passed through to the ``util`` helpers,
            ``getAnswerPrescores`` and ``scoreCommentSentiment``.
        query: the query; only used to construct the ``QuerySimilarity``
            object.
        queryResults: iterable of results, each exposing ``.post.id``,
            ``.post.title`` and ``.similarity``.
        topicModel: topic model handed to ``QuerySimilarity``.
        cutoffPercentile: users whose ``percentileRank`` is below this
            value are dropped from the result.
        resultCutoff: not used by this function; kept so existing callers
            keep working.

    Returns:
        A list of ``UserScore`` objects, each with ``percentileRank`` set,
        sorted by descending ``score``.  The list is empty when no answer
        with a usable user id was found.
    """
    class PostDetails:
        """One scored answer: its question, its title and both relevances."""
        def __init__(self, questionId=0, answerId=0, title="", questionRelevance=0, answerRelevance=0):
            self.questionId = questionId
            self.answerId = answerId
            self.title = title
            self.questionRelevance = questionRelevance
            self.answerRelevance = answerRelevance

    class UserScore:
        """Aggregated score of one user over all of that user's answers."""
        def __init__(self, userId, user, score, meanRelevance, postIds):
            self.userId = userId
            self.user = user
            self.score = score
            self.meanRelevance = meanRelevance
            self.postIds = postIds
            self.nPosts = len(self.postIds)
            # Debug trace through logging, like the rest of this function,
            # instead of an unconditional print to stdout.
            logging.debug("%r", self)

        def __repr__(self):
            return repr((self.user, self.userId, self.score, self.meanRelevance))

        def starScore(self, cutoffPercentile=75, nStars=5):
            """ convert the score to a number of stars, based on percentileRank (which must be added separately)"""
            # Width, in percentile points, of one star above the cutoff.
            starWidth = (100.0 - cutoffPercentile) / nStars
            rawStars = 1 + (self.percentileRank - cutoffPercentile - 1.0) // starWidth
            self.stars = int(min([nStars, rawStars]))
            return self

    # NOTE(review): querySim is only referenced by the disabled per-answer
    # relevance below; it is still constructed so that any work or
    # validation done by QuerySimilarity keeps happening - confirm whether
    # it can be dropped.
    querySim = topic_classification.TopicModeling.QuerySimilarity(topicModel, query)
    id2qr = {queryResult.post.id: queryResult for queryResult in queryResults}

    ids = []
    relevance = []
    userIds = []
    postIds = []
    for answer in util.iterateAnswers(db, id2qr):
        # Credit the owner; fall back to the last editor when there is none.
        useUserId = answer.owner_user_id if answer.owner_user_id is not None else answer.last_editor_user_id
        if not useUserId:
            continue
        questionQr = id2qr[answer.parent_id]
        ids.append(answer.id)
        userIds.append(useUserId)
        answerRelevance = 1.0 #querySim.similarity(answer.body)
        relevance.append(questionQr.similarity * answerRelevance)
        postIds.append(
            PostDetails(
                questionId=answer.parent_id,
                answerId=answer.id,
                title=questionQr.post.title,
                questionRelevance=questionQr.similarity,
                answerRelevance=answerRelevance
            )
        )
    logging.debug("iterating answers complete, getting prescores")
    if not ids:
        # Nothing to score: skip the database helpers and the percentile
        # computations, which would only see empty inputs.
        return []

    ages, scores, favorites, views, accepted = getAnswerPrescores(db, ids)
    logging.debug("got prescores...getting sentiment")
    commentSentimentDict = scoreCommentSentiment(db, ids)
    #commentSentiment = getCommentSentiments(db, ids)
    # Answers without an entry in the sentiment dict contribute 0.
    commentSentiment = array([commentSentimentDict.get(ident, 0) for ident in ids])
    logging.debug("got sentiment...calculating scores...")
    scores += commentSentiment * sentimentFactor

    # calculate the scores of the posts for this query using a scoring heuristic
    postIds = array(postIds)
    relevance = array(relevance)
    accepted = array(accepted)
    pctScores = array([percentileofscore(scores, s, 'strict') for s in scores], double)/100.0
    pctFavorites = array([percentileofscore(favorites, f, 'strict') for f in favorites], double)/100.0
    pctViews = array([percentileofscore(views, v, 'strict') for v in views], double)/100.0
    postScore = relevance * (1.0+pctScores) * (1.0+pctFavorites) * (1.0 + pctViews) * (1.0 + acceptedBonus * accepted)

    userIdSet = frozenset(userIds)
    userIds = array(userIds)
    # NOTE(review): this pairing assumes util.usersById returns display
    # names in the iteration order of userIdSet - verify against util.
    displayNames = {userId: displayName for (userId, displayName) in zip(userIdSet, util.usersById(db, userIdSet))}

    userScores = []
    for user in userIdSet:
        # Select this user's answers once and reuse the mask.
        ownAnswers = userIds == user
        userScores.append(
            UserScore(
                user,
                displayNames[user],
                postScore[ownAnswers].sum(),
                relevance[ownAnswers].mean(),
                postIds[ownAnswers]
            )
        )

    allUserScores = array([userScore.score for userScore in userScores], dtype=double)
    for userScore in userScores:
        userScore.percentileRank = percentileofscore(allUserScores, userScore.score)
    logging.debug("sorting users by score...")
    return sorted(filter(lambda us: us.percentileRank >= cutoffPercentile, userScores), key=lambda u: -u.score)