def score_response(self, comment, response):
    """
    Give an internal score to `response` as a candidate reply to `comment`.

    This function can be modified to tune the scoring. If the result is
    negative, we won't post. This is useful when there is more than one
    possible response in our database.

    comment -- the comment we are considering replying to (needs `.body`).
    response -- the stored candidate response (needs `.body` and `.score`).

    Returns -1 for rejected responses, otherwise a float combining the
    response's score, a penalty for differing context, a reward for the
    length of the comment's simplified body, and a little random fuzz.
    """
    # Discard the obviously bad responses
    if response.body.strip() == "[deleted]":
        return -1
    if response.score < config.good_comment_threshold:
        return -1
    # log10 is undefined for non-positive numbers, so a threshold configured
    # at or below zero must not let such scores reach the logarithm below.
    if response.score <= 0:
        return -1

    # Derive our base score. We use a logarithm, because reddit scores are
    # roughly logarithmic <http://amix.dk/blog/post/19588>
    base_score = math.log10(response.score)

    # A raw penalty to subtract for the comment being in a different
    # context from its parent. Without a parent to compare against, the
    # full penalty applies.
    max_penalty = math.log10(10000)
    response_parent = self.__get_parent(response)
    if response_parent is not None:
        similarity = self._get_parent_similarity_ratio(comment, response_parent)
        difference_penalty = max_penalty * (1 - similarity) ** 10
    else:
        difference_penalty = max_penalty

    # Give it some points for length. An empty simplified body is treated
    # as length 1 (reward 0) so that log10 never sees zero.
    simple_body = rewriter.simplify_body(comment.body)
    length_reward = math.log10(max(len(simple_body), 1))

    # throw in some randomness for good luck
    fuzz_multiplier = random.gauss(mu=1, sigma=0.05)

    # put it all together
    return (base_score - difference_penalty + length_reward) * fuzz_multiplier
def insert_comments(self, *comments, fast=False):
    """
    Insert one or more comments into `self.comments`.

    Each comment is converted with `__comment_to_json` and given a
    "metadata" entry holding the parent's body (raw and simplified), the
    comment's score, the insert time and the database format.

    comments -- the comment objects to store; with none given this is a no-op.
    fast -- if true, skip the parent lookup and store the parent bodies
            as None.
    """
    if not comments:
        return

    # Parents that are part of this batch can be resolved without the API
    comments_by_name = {c.name: c for c in comments}

    documents = []
    for c in comments:
        document = self.__comment_to_json(c)

        parent = None
        if not (fast or c.is_root):
            try:
                parent = comments_by_name[c.parent_id]
            except KeyError:
                # Not in this batch; ask the API. The lookup may come back
                # empty (e.g. the parent can no longer be found), in which
                # case we store the comment without parent information
                # instead of failing the whole batch.
                parent = c.reddit_session.get_info(thing_id=c.parent_id)

        if parent is None:
            parent_body = None
            parent_simple_body = None
        else:
            parent_body = parent.body
            parent_simple_body = rewriter.simplify_body(parent_body)

        # We insert all of our database-specific stuff in "metadata", so
        # that we can easily remove it before constructing comment objects
        document["metadata"] = {
            "parent_simple_body": parent_simple_body,
            "parent_body": parent_body,
            "score": c.score,  # ups minus downs
            "insert_time": time.time(),
            "database_format": DATABASE_FORMAT,
        }
        documents.append(document)

    self.comments.insert(documents)
def generate_comment_metadata(self, comment_json, comments_by_id=None, reddit_session=None):
    """
    Build the "metadata" dictionary for a comment given in JSON (dict) form.

    comment_json -- the comment as a dict; "parent_id", "ups" and "downs"
                    are required. An existing ["metadata"]["insert_time"]
                    is reused if present.
    comments_by_id -- optional mapping from a parent_id to the parent,
                      either a praw Comment or a dict with a "body" key.
    reddit_session -- optional session used as a last resort to fetch the
                      parent through the API.

    Returns a dict with the keys "parent_simple_body", "parent_body",
    "score", "insert_time" and "database_format". The parent fields are
    None when the parent is a submission or cannot be found.
    """
    if comments_by_id is None:
        comments_by_id = {}

    parent_body = parent_simple_body = None

    # find the parent body
    # Comments are of type t1, submissions are t3
    parent_id = comment_json["parent_id"]
    if parent_id[:2] != "t3":
        try:
            # lookup in local table
            parent = comments_by_id[parent_id]
        except KeyError:
            # lookup in database
            parent = self.comments.find_one({"name": parent_id}, {"body": True})
            if parent is None and reddit_session is not None:
                # fall back to looking it up with the API; the lookup may
                # come back empty, in which case we keep parent as None
                fetched = reddit_session.get_info(thing_id=parent_id)
                if fetched is not None:
                    parent = fetched.json_dict
        else:
            if isinstance(parent, praw.objects.Comment):
                parent = parent.json_dict

        # Pull out what we want for later
        if parent is not None:
            parent_body = parent["body"]
            parent_simple_body = rewriter.simplify_body(parent_body)

    # We shouldn't overwrite insert time if we can avoid it
    try:
        insert_time = comment_json["metadata"]["insert_time"]
    except KeyError:
        insert_time = time.time()

    # Put it all together
    return {
        "parent_simple_body": parent_simple_body,
        "parent_body": parent_body,
        "score": comment_json["ups"] - comment_json["downs"],
        "insert_time": insert_time,
        "database_format": DATABASE_FORMAT,
    }
def score_response(comment, response):
    """
    Internal scoring hook for a candidate response; tweak it freely.

    A negative result means the response will not be posted.
    """
    stripped_body = response.body.strip()
    if stripped_body == "[deleted]":
        return -1

    simplified = rewriter.simplify_body(comment.body)

    if response.score < 5:
        return -1

    # Score plus comment length, shifted down by 40, with gaussian fuzz
    raw_score = response.score - 40 + len(simplified)
    fuzz = random.gauss(1, .1)
    return raw_score * fuzz
def get_best_response(comment):
    """
    Pick the highest-scoring stored response for `comment`.

    Returns None when the simplified body is an ignored phrase, is shorter
    than 10 characters or has fewer than 2 spaces, when no stored response
    matches, or when the best score is negative.
    """
    simple_body = rewriter.simplify_body(comment.body)

    # Skip ignored phrases and bodies too short to be worth answering
    if (simple_body in config.ignore_phrases
            or len(simple_body) < 10
            or simple_body.count(" ") < 2):
        return None

    # Stored comments whose parent simplified to the same text as this one
    query = {
        "$query": {"metadata.parent_simple_body": simple_body},
        "$orderby": {"metadata.score": -1},
    }
    responses = db.get_comments(r, query)
    if not responses:
        return None

    # Pair every candidate with its score and keep the top pair
    scored = ((score_response(comment, candidate), candidate)
              for candidate in responses)
    top_score, top_response = max(scored, key=lambda pair: pair[0])

    if top_score < 0:
        return None
    return top_response
def get_best_response(self, comment):
    """
    Pick the highest-scoring stored response for `comment`.

    Returns None when the simplified body is an ignored phrase, is shorter
    than 10 characters or has fewer than 2 spaces, when the database has no
    matching response, or when the best score is negative.
    """
    simple_body = rewriter.simplify_body(comment.body)

    # Skip ignored phrases and bodies too short to be worth answering
    if (simple_body in config.ignore_phrases
            or len(simple_body) < 10
            or simple_body.count(" ") < 2):
        return None

    # Stored comments whose parent simplified to the same text as this one
    responses = self.database.get_comments(
        self.reddit_session,
        {"metadata.parent_simple_body": simple_body},
        good_only=True,
        limit=100
    )
    if not responses:
        return None

    # Pair every candidate with its score and keep the top pair
    scored = ((self.score_response(comment, candidate), candidate)
              for candidate in responses)
    top_score, top_response = max(scored, key=lambda pair: pair[0])

    if top_score < 0:
        return None
    return top_response