コード例 #1
0
    def get_title_from_comments(self, post, title):
        """Return the body of a comment whose tokens equal those of *title*.

        Returns None when the post fails any precondition (timing check,
        fewer than 10 comments, or any reports) or when no comment matches.

        :param post: object exposing ``created_utc``, ``num_comments``,
            ``num_reports`` and ``comments``.
        :param title: title text to look for among the comments.
        :return: the matching comment body, or None.
        """
        # NOTE(review): if created_utc is a past epoch timestamp this
        # difference is negative, so with a positive
        # MIN_COMMENT_CANDIDATE_DELAY the method would always bail out here.
        # Confirm the intended operand order.
        post_offset = post.created_utc - time.time()
        if post_offset < MIN_COMMENT_CANDIDATE_DELAY:
            return None
        if post.num_comments < 10 or post.num_reports:
            return None

        def identity(token):
            return token

        wanted_tokens = normalize(title, identity)
        for candidate in self.comments_sequence(post.comments):
            # MoreComments entries are skipped; only real comments are compared.
            if isinstance(candidate, MoreComments):
                continue
            # NOTE(review): this only admits comments timestamped more than
            # the delay *before* the post -- verify that is the intent.
            if not (candidate.created_utc + MIN_COMMENT_CANDIDATE_DELAY < post.created_utc):
                continue
            if tokens_equals(wanted_tokens, normalize(candidate.body, identity)):
                return candidate.body
        return None
コード例 #2
0
ファイル: copy_gen.py プロジェクト: cash2one/generators
    def get_title_from_comments(self, post, title):
        """Find a comment on *post* whose tokens are equal to the title's.

        The search is skipped (None is returned) when the timing guard
        fails, when the post has fewer than 10 comments, or when it has any
        reports. Otherwise the first comment accepted by ``tokens_equals``
        has its body returned; None is returned when there is no such comment.

        :param post: object exposing ``created_utc``, ``num_comments``,
            ``num_reports`` and ``comments``.
        :param title: title text to compare the comments against.
        :return: body of the first matching comment, or None.
        """
        # NOTE(review): for a post created in the past this difference is
        # negative; with a positive MIN_COMMENT_CANDIDATE_DELAY the guard
        # would always trigger. Confirm the operand order with the author.
        too_early = post.created_utc - time.time() < MIN_COMMENT_CANDIDATE_DELAY
        if too_early:
            return None
        if post.num_comments < 10:
            return None
        if post.num_reports:
            return None

        def keep(token):
            return token

        reference = normalize(title, keep)
        for item in self.comments_sequence(post.comments):
            is_real_comment = not isinstance(item, MoreComments)
            # NOTE(review): requires the comment timestamp plus the delay to
            # be earlier than the post timestamp -- verify this is intended.
            if is_real_comment and item.created_utc + MIN_COMMENT_CANDIDATE_DELAY < post.created_utc:
                if tokens_equals(reference, normalize(item.body, keep)):
                    return item.body
        return None
コード例 #3
0
ファイル: imgur_gen.py プロジェクト: cash2one/generators
    def generate_data(self, subreddit, key_words):
        """Yield a PostSource for every acceptable image found for *subreddit*.

        Runs a gallery search (sorted by time, one-week window) for each of
        MAX_PAGES pages, matching *subreddit* as tag, title or album. A
        non-album entry is considered as is. An album contributes a single
        randomly chosen image, and only when it has more ups than downs and
        more than MIN_UPS ups. Every image accepted by ``self.check`` has the
        hash of its normalized title added to ``self.toggled`` before it is
        yielded.

        ``key_words`` is not used by this method.

        Any exception raised along the way is logged and ends the generation;
        it is not re-raised.
        """
        try:
            # TODO: fix rate limit
            # TODO: add posts statuses
            for page_no in xrange(MAX_PAGES):
                query = "tag:%s OR title:%s OR album:%s" % ((subreddit,) * 3)
                log.info("retrieve for %s at page %s" % (subreddit, page_no))

                found = self.client.gallery_search(
                    q=query, sort='time', page=page_no, window='week')
                for item in found:
                    if not item.is_album:
                        candidates = [item]
                    elif item.ups - item.downs > 0 and item.ups > MIN_UPS:
                        album = self.client.get_album_images(item.id)
                        candidates = [random.choice(album)]
                    else:
                        candidates = []

                    for picture in candidates:
                        if not self.check(picture):
                            continue
                        self.toggled.add(hash(normalize(picture.title)))
                        yield PostSource(picture.link,
                                         self.process_title(picture.title))
        except Exception as err:
            log.exception(err)
コード例 #4
0
    def check(self, image):
        """Decide whether *image* may be used.

        Returns False when the image has no title, when the hash of its
        normalized title is already in ``self.toggled``, or when its height
        or width is below 500. Returns True when ``self.get_copies`` reports
        no copies for the image id.

        NOTE(review): when copies exist the method falls off the end and
        returns None (falsy) -- confirm this implicit result is intended.
        """
        if not image.title:
            return False
        if hash(normalize(image.title)) in self.toggled:
            return False
        if image.height < 500 or image.width < 500:
            return False

        known_copies = self.get_copies(image.id)
        if len(known_copies) == 0:
            return True
コード例 #5
0
ファイル: imgur_gen.py プロジェクト: cash2one/generators
    def check(self, image):
        """Return True when *image* passes the filters and has no copies.

        The image is rejected (False) if its title is empty, if the hash of
        its normalized title is already present in ``self.toggled``, or if
        either dimension is smaller than 500. Otherwise
        ``self.get_copies(image.id)`` is consulted and True is returned when
        it yields nothing.

        NOTE(review): with a non-empty copies result there is no explicit
        return, so the caller receives None -- verify this is deliberate.
        """
        title = image.title
        if not title:
            return False

        already_seen = hash(normalize(image.title)) in self.toggled
        if already_seen or image.height < 500 or image.width < 500:
            return False

        duplicates = self.get_copies(image.id)
        if len(duplicates) == 0:
            return True
コード例 #6
0
def is_valid_title(title):
    """Return True when *title* passes every title check.

    A title is rejected when it normalizes to fewer than MIN_WORDS_IN_TITLE
    tokens, when any token is in ``title_stop_words``, or when any entry of
    ``title_bad_validators`` finds a match in the raw title.
    """
    tokens = normalize(title, serialise=lambda x: x)
    too_short = len(tokens) < MIN_WORDS_IN_TITLE
    if too_short or set(tokens).intersection(title_stop_words):
        return False
    # Validators run against the raw title, not the normalized tokens.
    return not any(pattern.findall(title) for pattern in title_bad_validators)
コード例 #7
0
ファイル: copy_gen.py プロジェクト: cash2one/generators
def is_valid_title(title):
    """Check *title* against the length, stop-word and bad-pattern rules.

    :param title: raw title text.
    :return: False if the normalized title has fewer than
        MIN_WORDS_IN_TITLE tokens, shares a token with ``title_stop_words``,
        or is matched by any validator in ``title_bad_validators``;
        True otherwise.
    """
    title_words = normalize(title, serialise=lambda x: x)
    if len(title_words) < MIN_WORDS_IN_TITLE:
        return False

    stop_hits = set(title_words).intersection(title_stop_words)
    if stop_hits:
        return False

    # Each validator is applied to the raw title text.
    has_bad_match = any(
        checker.findall(title) for checker in title_bad_validators)
    return not has_bad_match
コード例 #8
0
    def generate_data(self, subreddit, key_words):
        """Generate PostSource objects from gallery search results.

        For each of MAX_PAGES pages the gallery is searched (sorted by time,
        one-week window) for entries matching *subreddit* as tag, title or
        album. Albums with more ups than downs and more than MIN_UPS ups
        contribute one randomly picked image, other albums contribute
        nothing, and non-album entries are used directly. Images accepted by
        ``self.check`` are recorded in ``self.toggled`` (by the hash of their
        normalized title) and then yielded as PostSource(link, processed
        title).

        ``key_words`` is not read by this method.

        Exceptions are logged and stop the generator without propagating.
        """
        try:
            # TODO: fix rate limit
            # TODO: add posts statuses
            for current_page in xrange(MAX_PAGES):
                search_query = "tag:%s OR title:%s OR album:%s" % (
                    subreddit, subreddit, subreddit)
                log.info("retrieve for %s at page %s" % (subreddit, current_page))

                search_args = dict(
                    q=search_query, sort='time', page=current_page, window='week')
                for hit in self.client.gallery_search(**search_args):
                    picked = [hit]
                    if hit.is_album:
                        picked = []
                        if hit.ups - hit.downs > 0 and hit.ups > MIN_UPS:
                            album_images = self.client.get_album_images(hit.id)
                            picked = [random.choice(album_images)]

                    for img in picked:
                        if self.check(img):
                            title_hash = hash(normalize(img.title))
                            self.toggled.add(title_hash)
                            yield PostSource(img.link, self.process_title(img.title))
        except Exception as error:
            log.exception(error)
コード例 #9
0
ファイル: validate.py プロジェクト: AlexeyProskuryakov/read
def check_comment_text(text, existed_comments_texts, exclude_words):
    """
    Decide whether *text* can be used given the already existing comments.

    The text must pass ``is_good_text`` and ``check_on_exclude``. It is then
    rejected if any existing comment that itself passes ``is_good_text`` has
    the same token set (same size and every token contained in the text's
    tokens). Any exception is logged and treated as a rejection.

    :param text: candidate comment text.
    :param existed_comments_texts: iterable of comment objects; each one's
        ``body`` attribute is compared against *text*.
    :param exclude_words: passed through to ``check_on_exclude``.
    :return: ``(True, tokens)`` when the text is acceptable, otherwise
        ``(False, None)``.
    """
    try:
        if not is_good_text(text):
            return False, None

        ok, c_tokens = check_on_exclude(text, exclude_words)
        if not ok:
            return False, None

        for existing in existed_comments_texts:
            body = existing.body
            if not is_good_text(body):
                continue
            body_tokens = set(normalize(body))
            # Equal size plus full containment means the token sets coincide.
            if len(c_tokens) == len(body_tokens) and body_tokens.issubset(c_tokens):
                return False, None

        return True, c_tokens
    except Exception as err:
        log.exception(err)

    return False, None
コード例 #10
0
 def check_title(title):
     """Return True when *title* shares no token with ``url``.

     ``url`` is not a parameter; it is read from a scope outside this
     function that is not visible here.
     """
     from_url = set(normalize(url, lambda x: x))
     from_title = set(normalize(title, lambda x: x))
     return from_url.isdisjoint(from_title)
コード例 #11
0
ファイル: copy_gen.py プロジェクト: cash2one/generators
 def check_title(title):
     """Tell whether the title's tokens are all absent from the url's tokens.

     ``url`` comes from a surrounding scope that is not shown here.

     :param title: title text to check.
     :return: False if at least one token occurs in both the url and the
         title, True otherwise.
     """
     url_token_set = set(normalize(url, lambda x: x))
     title_token_set = set(normalize(title, lambda x: x))
     shared = url_token_set.intersection(title_token_set)
     return not shared
コード例 #12
0
ファイル: validate.py プロジェクト: AlexeyProskuryakov/read
def check_on_exclude(text, exclude_dict):
    """Tokenize *text* and reject it if any token's hash is excluded.

    :param text: text to normalize into a set of tokens.
    :param exclude_dict: container tested for membership of ``hash(token)``.
    :return: ``(False, None)`` when the hash of any token is found in
        *exclude_dict*, otherwise ``(True, tokens)`` with the token set.
    """
    tokens = set(normalize(text))
    if any(hash(tok) in exclude_dict for tok in tokens):
        return False, None
    return True, tokens