Example #1
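These snippets come from the TrufflePig Steem bot project, and none of them show their imports. Below is a plausible shared import block; the trufflepig module paths are guesses inferred from the aliases (tpgd, tppp, tpmo, ...) and how they are called, so treat them as placeholders rather than the repository's actual layout:

# Assumed imports shared by the examples below; module paths are guesses.
import gc
import logging
import os
import time

import pandas as pd

import trufflepig.bchain.getdata as tpgd   # blockchain scraping helpers (assumed path)
import trufflepig.model as tpmo            # pipeline training and truffle search (assumed path)
import trufflepig.preprocessing as tppp    # post preprocessing (assumed path)
from trufflepig import config              # project settings (assumed path)
# MPSteem is a multiprocessing-safe Steem client used throughout; its
# import path is not shown in any of the examples.
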
def main():

    log_format = ('%(asctime)s %(processName)s:%(name)s:'
                  '%(funcName)s:%(lineno)s:%(levelname)s: %(message)s')
    logging.basicConfig(level=logging.INFO, format=log_format)
    directory = os.path.join(config.PROJECT_DIRECTORY, 'scraped_data')

    steem = MPSteem(nodes=config.NODES)
    current_datetime = '2018-02-01'

    crossval_filename = os.path.join(directory,
                                     'xval_{}.gz'.format(current_datetime))

    post_frame = tpgd.load_or_scrape_training_data(
        steem,
        directory,
        current_datetime=current_datetime,
        days=3,
        offset_days=0)

    regressor_kwargs = dict(n_estimators=256,
                            max_leaf_nodes=4096,
                            max_features=0.2,
                            n_jobs=-1,
                            verbose=1,
                            random_state=42,
                            oob_score=True)

    # topic_kwargs = dict(num_topics=32, no_below=5, no_above=0.1)
    doc2vec_kwargs = dict(size=128, epochs=32)

    post_frame = tppp.load_or_preprocess(post_frame,
                                         crossval_filename,
                                         ncores=4,
                                         chunksize=1000,
                                         min_en_prob=0.9)

    param_grid = {
        'feature_generation__topic_model__no_above': [0.33],
        #'regressor__max_leaf_nodes': [500, 1000],
        # 'regressor__max_features': [0.1, 0.2, 0.3]
    }

    # tpmo.cross_validate(post_frame, param_grid, topic_kwargs=topic_kwargs,
    #                     regressor_kwargs=regressor_kwargs, n_iter=None,
    #                     n_jobs=4, targets=['reward'])

    pipe, test_frame = tpmo.train_test_pipeline(
        post_frame,
        #topic_kwargs=topic_kwargs,
        regressor_kwargs=regressor_kwargs,
        doc2vec_kwargs=doc2vec_kwargs,
        targets=['reward', 'votes'])

    # topic_model = pipe.named_steps['feature_generation'].transformer_list[1][1]
    # logging.getLogger().info(topic_model.print_topics(n_best=None))

    tpmo.find_truffles(test_frame, pipe)
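The double-underscore keys in param_grid above follow scikit-learn's nested-parameter convention: pipeline step name, then nested estimator, then parameter. A minimal self-contained illustration with stock scikit-learn pieces (the step names here are made up, not TrufflePig's):

from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

pipe = Pipeline([('features', TfidfVectorizer()),
                 ('regressor', RandomForestRegressor())])
# 'regressor__max_leaf_nodes' addresses max_leaf_nodes of the 'regressor'
# step, exactly like 'feature_generation__topic_model__no_above' above.
pipe.set_params(regressor__max_leaf_nodes=1000)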
Example #2
def test_find_truffles():
    posts = create_n_random_posts(300)

    post_frame = pd.DataFrame(posts)

    regressor_kwargs = dict(n_estimators=20,
                            max_leaf_nodes=100,
                            max_features=0.1,
                            n_jobs=-1,
                            verbose=1,
                            random_state=42)

    topic_kwargs = dict(num_topics=50, no_below=5, no_above=0.7)

    post_frame = tppp.preprocess(post_frame, ncores=4, chunksize=50)
    pipeline = tpmo.train_pipeline(post_frame,
                                   topic_kwargs=topic_kwargs,
                                   regressor_kwargs=regressor_kwargs)

    posts = create_n_random_posts(50)

    post_frame = pd.DataFrame(posts)
    post_frame = tppp.preprocess(post_frame, ncores=4, chunksize=50)
    truffles = tpmo.find_truffles(post_frame, pipeline, account='aa')

    assert truffles.iloc[0].rank_score == truffles.rank_score.max()
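create_n_random_posts is a test fixture that is not shown here. A hypothetical stand-in that matches how the test uses it; the real fixture in the project will produce a richer and more realistic column set:

import random
import string

def create_n_random_posts(n):
    # Hypothetical fixture: random words are enough for the columns the
    # test touches (title, body, reward, votes).
    def words(k):
        return ' '.join(''.join(random.choices(string.ascii_lowercase, k=7))
                        for _ in range(k))
    return [dict(author='author{}'.format(i),
                 permalink='post-{}'.format(i),
                 title=words(5),
                 body=words(200),
                 reward=random.random() * 10,
                 votes=random.randint(0, 100))
            for i in range(n)]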
Example #3
def test_find_truffles_with_real_data(steem):
    df = tpbg.scrape_hour_data(steem, stop_after=20)

    df = tppp.preprocess(df)

    sorted_frame = tpmo.find_truffles(df, MockPipeline())

    assert sorted_frame.rank_score.iloc[0] == sorted_frame.rank_score.max()
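MockPipeline is likewise a test double that is not shown. A minimal sketch of what such a stub could look like, assuming find_truffles only calls predict() and expects one value per target and row (both assumptions on my part):

import numpy as np

class MockPipeline:
    # Hypothetical stub: random (reward, votes) predictions let the test
    # exercise ranking without training a real model.
    def predict(self, post_frame):
        return np.random.rand(len(post_frame), 2)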
Example #4
def main():

    log_format = ('%(asctime)s %(processName)s:%(name)s:'
                  '%(funcName)s:%(lineno)s:%(levelname)s: %(message)s')
    logging.basicConfig(level=logging.INFO, format=log_format)
    directory = os.path.join(config.PROJECT_DIRECTORY, 'scraped_data')

    crossval_filename = os.path.join(directory, 'xval_first_proto.gz')

    post_frame = pd.read_pickle('../scraped_data/first_post_set.gz')

    regressor_kwargs = dict(n_estimators=256,
                            max_leaf_nodes=1024,
                            max_features=0.3,
                            n_jobs=-1,
                            verbose=1,
                            random_state=42)

    doc2vec_kwargs = dict(size=32, epochs=20)

    # Prototype hack: derive a fake integer 'votes' target from the payout.
    post_frame['votes'] = post_frame.reward.astype(int).astype(float)
    post_frame = tppp.load_or_preprocess(post_frame,
                                         crossval_filename,
                                         ncores=4,
                                         chunksize=1000,
                                         min_en_prob=0.9)

    # param_grid = {
    #     #'feature_generation__topic_model__no_above':[0.05, 0.1, 0.2, 0.33],
    #     #'feature_generation__topic_model__num_topics':[50, 100, 200],
    #     'regressor__max_leaf_nodes': [50, 100, 200]
    #    # 'regressor__max_features': [0.1, 0.2, 0.3, 0.66]
    #     }
    #
    # tpmo.cross_validate(post_frame, param_grid, topic_kwargs=topic_kwargs,
    #                     regressor_kwargs=regressor_kwargs, n_iter=None,
    #                     n_jobs=4, targets=['reward'])

    pipe, test_frame = tpmo.train_test_pipeline(
        post_frame,
        doc2vec_kwargs=doc2vec_kwargs,
        regressor_kwargs=regressor_kwargs,
        targets=['reward', 'votes'])

    tpmo.find_truffles(test_frame, pipe)
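The fake 'votes' column in Example #4 deserves a second look: truncating the payout to an integer yields a vote-like target, so the two-target pipeline can be trained from a data set that only has rewards. In isolation:

import pandas as pd

post_frame = pd.DataFrame({'reward': [0.4, 2.7, 13.9]})
# astype(int) truncates toward zero; astype(float) restores the dtype the
# regressor expects for its targets.
post_frame['votes'] = post_frame.reward.astype(int).astype(float)
print(post_frame.votes.tolist())  # [0.0, 2.0, 13.0]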
Example #5
def main():

    logging.basicConfig(level=logging.INFO)

    author, permalink, current_datetime = parse_args()

    if current_datetime is None:
        # pd.datetime was removed in pandas 1.0; pd.Timestamp covers utcnow()
        current_datetime = pd.Timestamp.utcnow()
    else:
        current_datetime = pd.to_datetime(current_datetime)

    model_directory = os.path.join(config.PROJECT_DIRECTORY, 'trained_models')

    pipeline = tpmo.load_or_train_pipeline(None, model_directory,
                                           current_datetime)

    steem = MPSteem(nodes=config.NODES, no_broadcast=True)
    posts = tpgd.get_post_data([(author, permalink)], steem, {})

    posts = pd.DataFrame(posts)

    post_frame = tppp.preprocess(posts)

    tpmo.find_truffles(post_frame, pipeline)
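parse_args() in Example #5 is not shown; judging from the unpacking, it returns an author, a permalink, and an optional datetime string. A plausible argparse reconstruction (the argument names are invented and may differ from the real script):

import argparse

def parse_args():
    # Hypothetical reconstruction of the command-line interface.
    parser = argparse.ArgumentParser(description='Score a single Steem post.')
    parser.add_argument('author')
    parser.add_argument('permalink')
    parser.add_argument('--now', dest='current_datetime', default=None)
    args = parser.parse_args()
    return args.author, args.permalink, args.current_datetime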
Example #6
def execute_call(comment_authors_and_permalinks, poster, pipeline,
                 topN_permalink, max_comments, overview_permalink):
    """Executes the pig on duty call"""
    ncomments = len(comment_authors_and_permalinks)

    logger.info('Found {} comments mentioning {}'.format(
        ncomments, poster.account))
    if ncomments > max_comments:
        logger.info('Too many comments, reducing to {}'.format(max_comments))
        comment_authors_and_permalinks = \
            comment_authors_and_permalinks[:max_comments]

    posts = tpco.get_parent_posts(comment_authors_and_permalinks, poster.steem)

    initial_frame = pd.DataFrame(posts)
    post_frame = initial_frame.copy()

    post_frame = tppp.preprocess(post_frame, ncores=4)

    if len(post_frame):
        truffle_frame = tpmo.find_truffles(post_frame,
                                           pipeline,
                                           k=0,
                                           account='',
                                           add_rank_score=False)
        truffle_frame['passed'] = True
    else:
        truffle_frame = pd.DataFrame()

    # copy() avoids pandas' SettingWithCopyWarning on the assignment below
    filtered_posts = initial_frame[
        ~initial_frame.index.isin(truffle_frame.index)].copy()
    filtered_posts['passed'] = False

    combined = pd.concat([truffle_frame, filtered_posts], axis=0)

    topN_link = 'https://steemit.com/@{author}/{permalink}'.format(
        author=poster.account, permalink=topN_permalink)

    tpoc.post_on_call(combined,
                      poster=poster,
                      topN_link=topN_link,
                      overview_permalink=overview_permalink)
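The passed/failed bookkeeping in Example #6 is a reusable pattern: rows that survive a filter are flagged True, the complement is re-attached with False, and the concatenated frame preserves every input row. Reduced to its essentials:

import pandas as pd

frame = pd.DataFrame({'score': [3.0, 1.0, 2.5]})
kept = frame[frame.score > 2.0].copy()
kept['passed'] = True
# Everything not kept comes back with passed=False, so no row is lost.
dropped = frame[~frame.index.isin(kept.index)].copy()
dropped['passed'] = False
combined = pd.concat([kept, dropped], axis=0)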
Example #7
def main():
    """Main loop started from command line"""

    no_broadcast, current_datetime = parse_args()

    if current_datetime is None:
        # pd.datetime was removed in pandas 1.0; pd.Timestamp covers utcnow()
        current_datetime = pd.Timestamp.utcnow()
    else:
        current_datetime = pd.to_datetime(current_datetime)

    data_directory = os.path.join(config.PROJECT_DIRECTORY, 'scraped_data')
    model_directory = os.path.join(config.PROJECT_DIRECTORY, 'trained_models')
    log_directory = os.path.join(config.PROJECT_DIRECTORY, 'logs')

    configure_logging(log_directory, current_datetime)

    logger.info('STARTING main script at {}'.format(current_datetime))
    if no_broadcast:
        logger.info('Run without broadcasting.')
    else:
        logger.info('ATTENTION I WILL BROADCAST TO STEEMIT!!!')
    time.sleep(2)

    steem = MPSteem(nodes=config.NODES, no_broadcast=no_broadcast)
    # hack to allow for payments, because of https://github.com/steemit/steem-python/issues/191
    noapisteem = MPSteem(nodes=config.NODES[1:], no_broadcast=no_broadcast)
    # To post stuff
    account = config.ACCOUNT
    poster = Poster(account=account, steem=noapisteem)

    tppd.create_wallet(steem,
                       config.PASSWORD,
                       posting_key=config.POSTING_KEY,
                       active_key=config.ACTIVE_KEY)

    logger.info('Paying out investors')
    tpde.pay_delegates(
        account=account,
        steem=noapisteem,  # use a steem instance without api.steem!
        current_datetime=current_datetime)

    if not tpmo.model_exists(current_datetime, model_directory):

        post_frame = load_and_preprocess_2_frames(
            log_directory=log_directory,
            current_datetime=current_datetime,
            steem=steem,
            noapisteem=noapisteem,
            data_directory=data_directory)
        logger.info('Garbage collecting')
        gc.collect()
    else:
        post_frame = None

    regressor_kwargs = dict(n_estimators=256,
                            max_leaf_nodes=5000,
                            max_features=0.2,
                            n_jobs=-1,
                            verbose=1,
                            random_state=42)

    topic_kwargs = dict(num_topics=128,
                        no_below=7,
                        no_above=0.1,
                        ngrams=(1, 2),
                        keep_n=333000)

    if post_frame is not None and len(post_frame) > MAX_DOCUMENTS:
        logger.info('Frame has {} documents, too many, '
                    'reducing to {}'.format(len(post_frame), MAX_DOCUMENTS))
        post_frame.sort_values('created', inplace=True, ascending=False)
        train_frame = post_frame.iloc[:MAX_DOCUMENTS, :]
    else:
        train_frame = post_frame

    pipeline = tpmo.load_or_train_pipeline(
        train_frame,
        model_directory,
        current_datetime,
        regressor_kwargs=regressor_kwargs,
        topic_kwargs=topic_kwargs,
        targets=['adjusted_reward', 'adjusted_votes'])

    tpmo.log_pipeline_info(pipeline=pipeline)

    overview_permalink = tppw.return_overview_permalink_if_exists(
        account=account, current_datetime=current_datetime, steem=steem)

    if not overview_permalink:
        if post_frame is None:
            logger.info('Need to reload data for the weekly overview')
            post_frame = load_and_preprocess_2_frames(
                log_directory=log_directory,
                current_datetime=current_datetime,
                steem=steem,
                noapisteem=noapisteem,
                data_directory=data_directory)

        logger.info('I want to post my weekly overview')
        overview_permalink = tppw.post_weakly_update(
            pipeline=pipeline,
            post_frame=post_frame,
            poster=poster,
            current_datetime=current_datetime)

    logger.info('Garbage collecting')
    del post_frame
    gc.collect()

    prediction_frame = tpgd.scrape_hour_data(steem=steem,
                                             current_datetime=current_datetime,
                                             ncores=32,
                                             offset_hours=2)
    prediction_frame = tppp.preprocess(prediction_frame, ncores=8)

    sorted_frame = tpmo.find_truffles(prediction_frame,
                                      pipeline,
                                      account=account)

    permalink = tppd.post_topN_list(sorted_frame,
                                    poster=poster,
                                    current_datetime=current_datetime,
                                    overview_permalink=overview_permalink)

    tppd.comment_on_own_top_list(sorted_frame,
                                 poster=poster,
                                 topN_permalink=permalink)

    tppd.vote_and_comment_on_topK(sorted_frame,
                                  poster=poster,
                                  topN_permalink=permalink,
                                  overview_permalink=overview_permalink)

    logger.info('Computing the top trending without bidbots')
    logger.info('Searching for bid bots and bought votes')
    min_datetime = sorted_frame.created.min()
    max_datetime = sorted_frame.created.max() + pd.Timedelta(days=1)
    upvote_payments, bots = tpad.get_upvote_payments_to_bots(
        steem=noapisteem, min_datetime=min_datetime, max_datetime=max_datetime)
    logger.info('Adjusting votes and reward')
    sorted_frame = tppp.compute_bidbot_correction(
        post_frame=sorted_frame, upvote_payments=upvote_payments)
    tt0b.create_trending_post(sorted_frame,
                              upvote_payments=upvote_payments,
                              poster=poster,
                              topN_permalink=permalink,
                              overview_permalink=overview_permalink,
                              current_datetime=current_datetime,
                              bots=bots)

    logger.info('Done with normal duty, answering manual calls!')
    tfod.call_a_pig(poster=poster,
                    pipeline=pipeline,
                    topN_permalink=permalink,
                    current_datetime=current_datetime,
                    offset_hours=2,
                    hours=24,
                    overview_permalink=overview_permalink)

    logger.info('Cleaning up after myself')
    tfut.clean_up_directory(model_directory, keep_last=3)
    tfut.clean_up_directory(data_directory, keep_last=25)
    tfut.clean_up_directory(log_directory, keep_last=14)

    logger.info('Preloading -8 days for later training')
    tpgd.load_or_scrape_training_data(steem,
                                      data_directory,
                                      current_datetime=current_datetime,
                                      days=1,
                                      offset_days=8,
                                      ncores=32)

    logger.info('DONE at {}'.format(current_datetime))
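Finally, the MAX_DOCUMENTS cap in Example #7 keeps training tractable by preferring the newest posts: sort by creation time descending, then slice. A toy version (MAX_DOCUMENTS here is an arbitrary stand-in for the script's module-level constant, which is defined elsewhere):

import pandas as pd

MAX_DOCUMENTS = 3  # stand-in value; the real constant lives in the script
post_frame = pd.DataFrame({'created': pd.date_range('2018-01-01', periods=5),
                           'body': list('abcde')})
post_frame.sort_values('created', inplace=True, ascending=False)
train_frame = post_frame.iloc[:MAX_DOCUMENTS, :]  # the 3 newest posts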