def get_title_from_comments(self, post, title):
    """Search *post*'s comments for one whose text token-matches *title*.

    :param post: a submission object (expects created_utc, num_comments,
                 num_reports, comments attributes)
    :param title: candidate title text
    :return: the matching comment body, or None when the post is too new,
             too small, reported, or no comment matches.
    """
    # Post must be old enough to have accumulated candidate comments.
    # BUG FIX: original computed `post.created_utc - time.time()`, which is
    # always negative and therefore always below the positive delay, so the
    # function returned None unconditionally.
    if time.time() - post.created_utc < MIN_COMMENT_CANDIDATE_DELAY:
        return
    if post.num_comments < 10:
        return
    if post.num_reports:
        return
    title_tokens = normalize(title, lambda x: x)
    for comment in self.comments_sequence(post.comments):
        # Only real comments (not MoreComments stubs) posted at least
        # MIN_COMMENT_CANDIDATE_DELAY seconds after the post itself.
        # NOTE(review): the original test (comment.created_utc + DELAY <
        # post.created_utc) can never hold because comments are created
        # after the post; this is the presumed intent — confirm.
        if not isinstance(comment, MoreComments) and \
                comment.created_utc - post.created_utc > MIN_COMMENT_CANDIDATE_DELAY:
            comment_tokens = normalize(comment.body, lambda x: x)
            if tokens_equals(title_tokens, comment_tokens):
                return comment.body
def get_title_from_comments(self, post, title):
    """Search *post*'s comments for one whose text token-matches *title*.

    :param post: a submission object (expects created_utc, num_comments,
                 num_reports, comments attributes)
    :param title: candidate title text
    :return: the matching comment body, or None when the post is too new,
             too small, reported, or no comment matches.
    """
    # Post must be old enough to have accumulated candidate comments.
    # BUG FIX: original computed `post.created_utc - time.time()`, which is
    # always negative and therefore always below the positive delay, so the
    # function returned None unconditionally.
    if time.time() - post.created_utc < MIN_COMMENT_CANDIDATE_DELAY:
        return
    if post.num_comments < 10:
        return
    if post.num_reports:
        return
    title_tokens = normalize(title, lambda x: x)
    for comment in self.comments_sequence(post.comments):
        # Only real comments (not MoreComments stubs) posted at least
        # MIN_COMMENT_CANDIDATE_DELAY seconds after the post itself.
        # NOTE(review): the original test (comment.created_utc + DELAY <
        # post.created_utc) can never hold because comments are created
        # after the post; this is the presumed intent — confirm.
        if not isinstance(comment, MoreComments) and \
                comment.created_utc - post.created_utc > MIN_COMMENT_CANDIDATE_DELAY:
            comment_tokens = normalize(comment.body, lambda x: x)
            if tokens_equals(title_tokens, comment_tokens):
                return comment.body
def generate_data(self, subreddit, key_words):
    """Yield PostSource objects for imgur images matching *subreddit*.

    Walks up to MAX_PAGES pages of the imgur gallery search; an album
    contributes one randomly chosen image, and only when its score is
    positive and its ups exceed MIN_UPS.
    """
    # todo fix rate limit
    # todo add posts statuses
    try:
        # The query is the same for every page, so build it once.
        query = "tag:%s OR title:%s OR album:%s" % (subreddit, subreddit, subreddit)
        for page in xrange(MAX_PAGES):
            log.info("retrieve for %s at page %s" % (subreddit, page))
            results = self.client.gallery_search(q=query, sort='time', page=page, window='week')
            for entity in results:
                if not entity.is_album:
                    candidates = [entity]
                elif entity.ups - entity.downs > 0 and entity.ups > MIN_UPS:
                    # Sample a single image from a well-upvoted album.
                    candidates = [random.choice(self.client.get_album_images(entity.id))]
                else:
                    candidates = []
                for image in candidates:
                    if not self.check(image):
                        continue
                    # Remember the title so the same image is not yielded twice.
                    self.toggled.add(hash(normalize(image.title)))
                    yield PostSource(image.link, self.process_title(image.title))
    except Exception as e:
        # Best-effort scraper boundary: log and stop the generator.
        log.exception(e)
        return
def check(self, image):
    """Return True when *image* is usable as a post candidate.

    Usable means: it has a title, the title has not been seen before
    (``self.toggled``), it is at least 500x500, and imgur reports no
    copies of it.
    """
    if not image.title or hash(normalize(image.title)) in self.toggled or \
            image.height < 500 or image.width < 500:
        return False
    copies = self.get_copies(image.id)
    # BUG FIX: the original implicitly returned None when copies existed;
    # return an explicit boolean for both outcomes (callers only truth-test
    # the result, so this is backward-compatible).
    return len(copies) == 0
def is_valid_title(title):
    """Return True when *title* is long enough, contains no stop words,
    and matches none of the bad-title validators."""
    tokens = normalize(title, serialise=lambda x: x)
    if len(tokens) < MIN_WORDS_IN_TITLE:
        return False
    # Reject as soon as any token is a stop word.
    if any(word in title_stop_words for word in tokens):
        return False
    # Reject when any bad-title regex finds something in the raw title.
    return not any(validator.findall(title) for validator in title_bad_validators)
def generate_data(self, subreddit, key_words):
    """Generator of PostSource items built from an imgur gallery search
    over *subreddit*, scanning up to MAX_PAGES result pages."""
    # todo fix rate limit
    # todo add posts statuses
    try:
        for page in xrange(MAX_PAGES):
            q = "tag:%s OR title:%s OR album:%s" % (subreddit, subreddit, subreddit)
            log.info("retrieve for %s at page %s" % (subreddit, page))
            for entity in self.client.gallery_search(q=q, sort='time', page=page, window='week'):
                if entity.is_album:
                    # Albums contribute one random image, and only when
                    # they are clearly upvoted.
                    popular = entity.ups - entity.downs > 0 and entity.ups > MIN_UPS
                    images = [random.choice(self.client.get_album_images(entity.id))] if popular else []
                else:
                    images = [entity]
                for image in images:
                    if self.check(image):
                        # Mark the title as used before yielding.
                        self.toggled.add(hash(normalize(image.title)))
                        yield PostSource(image.link, self.process_title(image.title))
    except Exception as e:
        # Scraper boundary: log the failure and end the stream.
        log.exception(e)
        return
def check_comment_text(text, existed_comments_texts, exclude_words):
    """Validate *text* as a new comment candidate.

    The text must be "good" (``is_good_text``), contain no excluded
    words, and must not duplicate an existing comment at the token level
    (identical word sets count as duplicates).

    :param text: candidate comment body
    :param existed_comments_texts: comment objects already on the post
    :param exclude_words: container of hashes of forbidden tokens
    :return: (True, token_set) when acceptable, otherwise (False, None)
    """
    try:
        if is_good_text(text):
            ok, c_tokens = check_on_exclude(text, exclude_words)
            if ok:
                for comment in existed_comments_texts:
                    p_text = comment.body
                    if is_good_text(p_text):
                        p_tokens = set(normalize(p_text))
                        # SIMPLIFICATION: "equal length and full
                        # intersection" for two sets is just set equality.
                        if p_tokens == c_tokens:
                            return False, None
                return True, c_tokens
    except Exception as e:
        log.exception(e)
    # Explicit fallthrough: every path returns a 2-tuple so callers can
    # always unpack the result.
    return False, None
def check_title(title):
    """Return True when *title* shares no tokens with ``url``.

    NOTE(review): ``url`` is not a parameter and is not defined in this
    scope — as written this raises NameError at call time unless a
    module-level ``url`` global exists. It most likely should be a second
    parameter; confirm against callers before changing the signature.
    """
    url_tokens = normalize(url, lambda x: x)
    title_tokens = normalize(title, lambda x: x)
    # Any token overlap between the url and the title disqualifies it.
    if len(set(url_tokens).intersection(set(title_tokens))) > 0:
        return False
    return True
def check_on_exclude(text, exclude_dict):
    """Tokenize *text* and reject it when any token's hash is excluded.

    :param text: text to tokenize via ``normalize``
    :param exclude_dict: container of forbidden token hashes
    :return: (False, None) when an excluded token is present,
             otherwise (True, token_set)
    """
    tokens = set(normalize(text))
    if any(hash(token) in exclude_dict for token in tokens):
        return False, None
    return True, tokens