Example #1
    def __init__(self,
                 db_address,
                 load_model_name=None,
                 model_config_name=None):
        self.posts = set()
        self.past_votes_len = 0

        self.db_address = db_address

        db_blacklist, _ = load_data(self.db_address, BLACK_LIST_COL)
        self.blacklist = set(x['_id'] for x in db_blacklist)
        logger.info("Blacklist loaded: %d", len(self.blacklist))

        if load_model_name:
            from steevebase.model import models

            model_config = load_training_config(model_config_name)

            self.output_size = model_config['MODEL']['OUTPUT_SIZE']

            DATA_CONFIG = model_config['DATA']
            self.max_seq_len = DATA_CONFIG['MAX_SEQ_LEN']
            self.embedding_size = len(
                DATA_CONFIG['ALPHABET']
            ) + 1  # 1 symbol reserved for unknown chars

            self.model_class = models.create_cnn_class(model_config)
            self.model_class.load_weights(load_model_name)
            logger.info("Model %s loaded..." % load_model_name)

            self.char2ind, _ = get_char_ind_dicts(
                model_config['DATA']['ALPHABET'])
            self.probability = 0.
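
Every example here calls load_data and unpacks two return values, but the helper itself is not shown. A minimal sketch of what it could look like, assuming a pymongo backend, that the connection string names a database, and that the second return value is the number of matching documents (all assumptions, not confirmed by these snippets):

from pymongo import MongoClient


def load_data(db_address, col_name, query=None):
    # Hypothetical helper: open the connection, run the (optional) query,
    # and return a cursor over the documents plus the matching-document count.
    query = query or {}
    client = MongoClient(db_address)
    db = client.get_default_database()  # assumes the URI includes a database name
    collection = db[col_name]
    return collection.find(query), collection.count_documents(query)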
Example #2
    def __init__(self,
                 db_address=None,
                 load_model_name=None,
                 model_config_name=None):
        if db_address:
            self.posts = set()
            self.past_votes_len = 0

            self.db_address = db_address

            db_blacklist, _ = load_data(self.db_address, BLACK_LIST_COL)
            # self.blacklist = set(x['_id'] for x in db_blacklist if 'reason' not in x or x['reason'] != 'unsubscribe')
            self.blacklist = set(x['_id'] for x in db_blacklist)
            logger.info("Blacklist loaded: %d", len(self.blacklist))

            self.spam_check = None

        if load_model_name:
            # init spam check
            init_time_interval_hrs = 24

            self.spam_check = SpamCheck(
                db_address=db_address,
                init_time_interval_hrs=init_time_interval_hrs)
            logger.info('SpamCheck init on last %d hours',
                        init_time_interval_hrs)

            from steevebase.model import models

            model_config = load_training_config(model_config_name)

            self.output_size = model_config['MODEL']['OUTPUT_SIZE']

            DATA_CONFIG = model_config['DATA']
            self.max_seq_len = DATA_CONFIG['MAX_SEQ_LEN']
            self.embedding_size = len(
                DATA_CONFIG['ALPHABET']
            ) + 1  # 1 symbol reserved for unknown chars

            self.model_class = models.create_cnn_class(model_config)
            self.model_class.load_weights(load_model_name)
            logger.info("Model %s loaded..." % load_model_name)

            self.char2ind, _ = get_char_ind_dicts(
                model_config['DATA']['ALPHABET'])
            self.probability = 0.
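
The constructor only prepares the character vocabulary (char2ind), max_seq_len and the CNN weights; the encoding step they support is not shown. A purely illustrative sketch of such a step (the function name is hypothetical, and the reserved unknown-character index is assumed to be 0):

import numpy as np


def encode_text(text, char2ind, max_seq_len):
    # Map each character to its index; unmapped characters fall back to the
    # reserved "unknown" slot (hence embedding_size = len(ALPHABET) + 1).
    indices = [char2ind.get(c, 0) for c in text[:max_seq_len]]
    # Pad to a fixed length so batches have a uniform shape.
    indices += [0] * (max_seq_len - len(indices))
    return np.array(indices)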
Example #3
    def __init__(self, args):
        assert args.db_address.startswith("mongodb://")

        self.checks = Checks(args.db_address, args.load_model_name,
                             args.model_config_name)

        self.posts = set()
        self.df_current_posts = pd.DataFrame({'post': [], 'pending_times': []})
        self.df_past_votes = pd.DataFrame({
            'vote_time':
            pd.Series(dtype='datetime64[ns]'),
            '_id': []
        })

        self.db_address = args.db_address
        time_now = datetime.utcnow()
        minusdelta24 = time_now - timedelta(hours=24)

        query = {"vote_time": {"$gt": minusdelta24}}
        old_voted_posts_iter, _ = load_data(self.db_address, PAST_VOTES, query)
        for post in old_voted_posts_iter:
            self.df_past_votes = self.df_past_votes.append(
                {
                    'vote_time': int(post["vote_time"].timestamp()),
                    '_id': post['_id']
                },
                ignore_index=True)

        logger.info("Past votes loaded: %d", len(self.df_past_votes))

        self.steem_address = args.steem_address
        self.wif = WIF

        start_block_number = get_steem_info(
            self.steem_address)['head_block_number'] - int(
                VOTING_DELAY / BLOCK_INTERVAL)
        self.stream = stream_ops('comment',
                                 steem_address=self.steem_address,
                                 wif=self.wif,
                                 start_block_number=start_block_number)

        self.a_lock = allocate_lock()
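
Note that DataFrame.append, used in the loop above, was removed in pandas 2.0. On current pandas the same accumulation can be expressed with pd.concat; a sketch replacing the loop (the helper name is hypothetical, and vote_time is kept as an integer timestamp exactly as above, even though the column was declared datetime64[ns]):

import pandas as pd


def load_past_votes(old_voted_posts_iter, df_past_votes):
    # Build all rows first, then concatenate once instead of appending per row.
    rows = [{'vote_time': int(post['vote_time'].timestamp()), '_id': post['_id']}
            for post in old_voted_posts_iter]
    if not rows:
        return df_past_votes
    return pd.concat([df_past_votes, pd.DataFrame(rows)], ignore_index=True)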
Example #4
    def check_author_not_spamming(self, post):
        try:
            author = post['author']
            created = post['created']
        except PostDoesNotExist:
            return False

        # Check whether called in Voter or in Wrangler - not nice :(
        if datetime.utcnow() - created >= timedelta(
                hours=DELAY / (3600 / 3)):  # DELAY is in blocks; at 3 s per block, 3600 / 3 = 1200 blocks per hour
            return True

        # find out how many posts were created by the author in the last 24 hours
        start_time = datetime.utcnow() - timedelta(hours=24)
        last_24_hrs = {'$gte': start_time}
        query = {'author': author, 'created': last_24_hrs}
        _, count = load_data(self.db_address, RAW_POSTS_COL, query)

        if count > 2 * SPAM_LIMIT:
            self.add_to_blacklist(author, count)

        return count < SPAM_LIMIT
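
For reference, the count used above boils down to a simple range query on the author's posts; expressed directly with pymongo it would look roughly like this (assuming RAW_POSTS_COL names the raw-posts collection and the URI includes a database name):

from datetime import datetime, timedelta
from pymongo import MongoClient


def count_recent_posts(db_address, col_name, author, hours=24):
    # Count documents created by this author within the last `hours` hours.
    start_time = datetime.utcnow() - timedelta(hours=hours)
    collection = MongoClient(db_address).get_default_database()[col_name]
    return collection.count_documents(
        {'author': author, 'created': {'$gte': start_time}})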
Example #5
def wrangler_iterator(input_name,
                      blacklist_db_address,
                      block_step=BLOCK_STEP,
                      query=None):
    posts_count = 0
    filtered_count = 0

    checks = Checks(db_address=blacklist_db_address)

    raw_posts_iter, raw_posts_count = load_data(
        input_name, col_name=RAW_POSTS_COLLECTION_NAME, query=query)

    num_batches = math.ceil(raw_posts_count / block_step)

    for batch in range(num_batches):
        step = min(block_step, raw_posts_count - block_step * batch)

        # download & clean
        posts = [clean_post(next(raw_posts_iter)) for _ in range(step)]
        posts_count += len(posts)

        # filter
        filtered_posts = [
            post for post in posts
            if checks.check_conditions(post, Checks.checklist_all)
        ]
        filtered_count += len(filtered_posts)

        # transform
        transformed_posts = [extract_features(post) for post in filtered_posts]

        logger.info("Number of posts processed: %d / %d", posts_count,
                    raw_posts_count)
        yield transformed_posts

    logger.info("Total clean posts: %d / %d", filtered_count, raw_posts_count)
Example #6
def fetch_training_data(mongo_address, clean_posts_col_name, start_time):
    query = {'created': {'$gte': start_time}}
    return load_data(mongo_address, clean_posts_col_name, query)
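
For instance, pulling the last 30 days of cleaned posts for training might look like this (the address and collection name are illustrative only):

from datetime import datetime, timedelta

start_time = datetime.utcnow() - timedelta(days=30)
posts_iter, n_posts = fetch_training_data(
    "mongodb://localhost:27017/steeve", "clean_posts", start_time)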