Example 1
def post_processing(mongo, batch_size=100, max_workers=50):
    indexer = Indexer(mongo)
    start_block = indexer.get_checkpoint('post_processing')

    query = {
        "block_num": {
            "$gt": start_block,
            "$lte": start_block + batch_size,
        }
    }
    projection = {
        '_id': 0,
        'body': 0,
        'json_metadata': 0,
    }
    results = list(mongo.Operations.find(query, projection=projection))
    batches = map(parse_operation, results)

    # handle an edge case when we are too close to the head,
    # and the batch contains no work to do
    if not results and is_recent(start_block, days=1):
        return

    # squash for duplicates
    def custom_merge(*args):
        return list(set(keep(flatten(args))))

    batch_items = merge_with(custom_merge, *batches)

    # only process accounts if the blocks are recent
    # scrape_all_users should take care of stale updates
    if is_recent(start_block, days=10):
        accounts = set(batch_items['accounts_light'] +
                       batch_items['accounts'])
        list(thread_multi(
            fn=update_account,
            fn_args=[mongo, None],
            dep_args=list(accounts),
            fn_kwargs=dict(load_extras=False),
            max_workers=max_workers,
            re_raise_errors=False,
        ))
        list(thread_multi(
            fn=update_account_ops_quick,
            fn_args=[mongo, None],
            dep_args=list(accounts),
            fn_kwargs=None,
            max_workers=max_workers,
            re_raise_errors=False,
        ))

    index = max(lpluck('block_num', results))
    indexer.set_checkpoint('post_processing', index)

    log.info("Checkpoint: %s - %s accounts (+%s full)",
             index,
             len(batch_items['accounts_light']),
             len(batch_items['accounts']))
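
post_processing() advances the 'post_processing' checkpoint by at most batch_size blocks per call, so in practice it is driven by an outer loop. A minimal sketch of such a runner, reusing the Indexer and post_processing from Example 1 (the polling interval and loop structure here are assumptions, not the project's actual worker loop):

import time

def run_post_processing(mongo, batch_size=100):
    # Hypothetical driver: keep calling post_processing() until the
    # checkpoint stops advancing, then back off before polling again.
    indexer = Indexer(mongo)
    while True:
        before = indexer.get_checkpoint('post_processing')
        post_processing(mongo, batch_size=batch_size)
        if indexer.get_checkpoint('post_processing') == before:
            # no new work (empty batch near the head); wait and retry
            time.sleep(60)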
Example 2
def batch_update_async(batch_items: dict):
    # todo break this batch into posts and account updates

    # if we're lagging by a large margin, don't bother updating accounts
    lag = time_delta(find_latest_item(mongo, 'Posts', 'created'))
    if lag > 1000:
        return

    if use_multi_threading:
        with log_exceptions():
            thread_multi(
                fn=update_account,
                fn_args=[mongo, None],
                dep_args=batch_items['accounts_light'],
                fn_kwargs=dict(load_extras=False),
                max_workers=num_threads,
            )
            thread_multi(
                fn=update_account_ops_quick,
                fn_args=[mongo, None],
                dep_args=batch_items['accounts_light'],
                fn_kwargs=None,
                max_workers=num_threads,
            )
    else:
        for account_name in batch_items['accounts_light']:
            with log_exceptions():
                update_account(mongo, account_name, load_extras=False)
                update_account_ops_quick(mongo, account_name)

    if use_multi_threading:
        with log_exceptions():
            thread_multi(
                fn=update_account,
                fn_args=[mongo, None],
                dep_args=batch_items['accounts'],
                fn_kwargs=dict(load_extras=True),
                max_workers=num_threads,
            )
            thread_multi(
                fn=update_account_ops_quick,
                fn_args=[mongo, None],
                dep_args=batch_items['accounts'],
                fn_kwargs=None,
                max_workers=num_threads,
            )
    else:
        for account_name in batch_items['accounts']:
            with log_exceptions():
                update_account(mongo, account_name, load_extras=True)
                update_account_ops_quick(mongo, account_name)
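
log_exceptions() is used above as a context manager but is not defined in this example. A minimal stand-in, assuming it simply logs and swallows exceptions so that one failing account does not abort the rest of the batch (this is a guess at its behavior, not the project's implementation):

from contextlib import contextmanager

@contextmanager
def log_exceptions():
    # Hypothetical helper: log the traceback and continue with the next item.
    try:
        yield
    except Exception:
        log.exception('batch_update_async step failed')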
Example 3
def scrape_comments(mongo, batch_size=250, max_workers=50):
    """ Parse operations and post-process for comment/post extraction. """
    indexer = Indexer(mongo)
    start_block = indexer.get_checkpoint('comments')

    query = {
        "type": "comment",
        "block_num": {
            "$gt": start_block,
            "$lte": start_block + batch_size,
        }
    }
    projection = {
        '_id': 0,
        'block_num': 1,
        'author': 1,
        'permlink': 1,
    }
    results = list(mongo.Operations.find(query, projection=projection))
    identifiers = set(f"{x['author']}/{x['permlink']}" for x in results)

    # handle an edge case when we are too close to the head,
    # and the batch contains no work to do
    if not results and is_recent(start_block, days=1):
        return

    # get Post.export() results in parallel
    raw_comments = thread_multi(fn=get_comment,
                                fn_args=[None],
                                dep_args=list(identifiers),
                                max_workers=max_workers,
                                yield_results=True)
    raw_comments = lkeep(raw_comments)

    # split into root posts and comments
    posts = lfilter(lambda x: x['depth'] == 0, raw_comments)
    comments = lfilter(lambda x: x['depth'] > 0, raw_comments)

    # Mongo upsert many
    log_output = ''
    if posts:
        r = mongo.Posts.bulk_write(
            [
                UpdateOne({'identifier': x['identifier']},
                          {'$set': {
                              **x, 'updatedAt': dt.datetime.utcnow()
                          }},
                          upsert=True) for x in posts
            ],
            ordered=False,
        )
        log_output += \
            f'(Posts: {r.upserted_count} upserted, {r.modified_count} modified) '
    if comments:
        r = mongo.Comments.bulk_write(
            [
                UpdateOne({'identifier': x['identifier']},
                          {'$set': {
                              **x, 'updatedAt': dt.datetime.utcnow()
                          }},
                          upsert=True) for x in comments
            ],
            ordered=False,
        )
        log_output += \
            f'(Comments: {r.upserted_count} upserted, {r.modified_count} modified) '

    # We only query {type: 'comment'} operations, and the gaps between
    # them are sometimes larger than batch_size, so advance the checkpoint
    # by a full batch when the current batch turns up no results.
    index = (silent(max)(lpluck('block_num', results))
             or (start_block + batch_size))
    indexer.set_checkpoint('comments', index)

    log.info(f'Checkpoint: {index} {log_output}')
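
The checkpoint fallback above relies on funcy's silent(), which wraps a function so that it returns None instead of raising; combined with the or-fallback, an empty batch still moves the 'comments' checkpoint forward by a full batch_size. A small, self-contained illustration of that pattern:

from funcy import silent

start_block, batch_size = 20_000_000, 250
block_nums = []  # e.g. no comment operations fell in this block range

# max([]) raises ValueError; silent(max) returns None instead,
# so the or-fallback advances the checkpoint past the empty gap.
index = silent(max)(block_nums) or (start_block + batch_size)
assert index == 20_000_250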