def predict(self, document_frame):
    logger.info('Predicting {} values'.format(len(document_frame)))
    values = self.transform(document_frame)
    results = np.zeros((len(values), self.trainY.shape[1]))
    logger.info('Finding {} nearest neighbors'.format(self.knn))
    for idx in range(len(values)):
        vector = values[idx, :]
        # query the Doc2Vec model for the `knn` most similar training posts
        returns = self.model.docvecs.most_similar(positive=[vector],
                                                  topn=self.knn)
        indices = [doctag for doctag, sim in returns]
        # the prediction is the mean target value of these nearest neighbors
        mean_vals = self.trainY.loc[indices, :].mean()
        results[idx, :] = mean_vals
        progressbar(idx, len(values), logger=logger)
    return results
def transform(self, document_frame):
    dim = self.model.vector_size
    inputs = np.zeros((len(document_frame), dim))
    logger.info('Transforming documents into matrix of '
                'shape {}'.format(inputs.shape))
    tagged_docs = self.create_tagged_documents(document_frame)
    for kdx, (author, permalink) in enumerate(
            zip(document_frame.author, document_frame.permalink)):
        try:
            # known training document: look up its trained vector directly
            inputs[kdx, :] = self.model.docvecs[author + '/' + permalink]
        except KeyError:
            # infer the test vector
            inputs[kdx, :] = self.model.infer_vector(
                tagged_docs[kdx].words, steps=self.infer_steps)
        progressbar(kdx, len(inputs), logger=logger)
    return inputs
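# A minimal usage sketch of the Doc2Vec calls relied on above (assuming
# gensim < 4.0, which matches the `steps=` keyword and `model.docvecs[...]`
# lookups; the tiny corpus and 'author/permalink' tags are made up for
# illustration and are not part of the original code).
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

docs = [
    TaggedDocument(words=['post', 'about', 'fishing'], tags=['alice/fish-post']),
    TaggedDocument(words=['post', 'about', 'trains'], tags=['bob/train-post']),
]
model = Doc2Vec(docs, vector_size=16, min_count=1, epochs=40)

# Known documents can be looked up by their 'author/permalink' tag...
known_vector = model.docvecs['alice/fish-post']
# ...while unseen documents get a vector inferred from their words, and the
# nearest known tags come back as (doctag, similarity) pairs, as in predict().
inferred = model.infer_vector(['new', 'post', 'about', 'fishing'], steps=10)
print(model.docvecs.most_similar(positive=[inferred], topn=1))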
def get_all_posts_between_parallel(start_datetime, end_datetime, steem,
                                   stop_after=None, ncores=8, chunksize=20,
                                   timeout=1200):
    """As above, but in parallel with `ncores` jobs of `chunksize` blocks each.

    Waits at most `timeout` seconds for each chunk of posts.

    """
    start_num, block_start_datetime = find_nearest_block_num(start_datetime,
                                                             steem)
    end_num, block_end_datetime = find_nearest_block_num(end_datetime, steem)

    logger.info('Querying IN PARALLEL with {} cores all posts between '
                '{} (block {}) and {} (block {})'.format(ncores,
                                                         block_start_datetime,
                                                         start_num,
                                                         block_end_datetime,
                                                         end_num))

    block_nums = list(range(start_num, end_num + 1))
    chunks = [block_nums[irun: irun + chunksize]
              for irun in range(0, len(block_nums), chunksize)]

    ctx = mp.get_context('spawn')
    pool = ctx.Pool(ncores, initializer=config_mp_logging)

    async_results = []
    for idx, chunk in enumerate(chunks):
        result = pool.apply_async(_get_all_posts_for_blocks_parallel,
                                  args=(chunk, steem, stop_after))
        async_results.append(result)
        if stop_after is not None and idx >= stop_after:
            break
    pool.close()

    posts = []
    terminate = False
    # `async` is a reserved keyword in Python 3.7+, hence `async_result`
    for kdx, async_result in enumerate(async_results):
        try:
            new_posts = async_result.get(timeout=timeout)
            posts.extend(new_posts)
            if progressbar(kdx, len(chunks), percentage_step=5, logger=logger):
                logger.info('Finished chunk {} '
                            'out of {} found so far {} '
                            'posts...'.format(kdx + 1, len(chunks),
                                              len(posts)))
        except Exception:
            logger.exception('Something went totally wrong dude!')
            terminate = True
    if terminate:
        logger.error('Terminating pool due to timeout or errors')
        pool.terminate()
    pool.join()
    return posts
def check_all_ops_between(start_datetime, end_datetime, steem, account,
                          stop_after=None):
    """Checks all operations in blocks between start and end for
    comments mentioning `account`

    Parameters
    ----------
    start_datetime: datetime
    end_datetime: datetime
    steem: Steem
    account: str
    stop_after: int or None
        For debugging

    Returns
    -------
    List of authors and permalinks of comments mentioning the account

    """
    start_num, block_start_datetime = tpbg.find_nearest_block_num(
        start_datetime, steem)
    end_num, block_end_datetime = tpbg.find_nearest_block_num(
        end_datetime, steem)

    total = end_num - start_num
    comment_authors_and_permalinks = []
    logger.info('Checking all operations for account {} between '
                '{} (block {}) and {} (block {})'.format(
                    account, block_start_datetime, start_num,
                    block_end_datetime, end_num))
    for idx, block_num in enumerate(range(start_num, end_num + 1)):
        authors_and_permalinks = check_all_ops_in_block(block_num, steem,
                                                        account)
        comment_authors_and_permalinks.extend(authors_and_permalinks)
        if progressbar(idx, total, percentage_step=1, logger=logger):
            logger.info('Finished block {} '
                        '(last is {}) found so far {} '
                        'comments mentioning me...'.format(
                            block_num, end_num,
                            len(comment_authors_and_permalinks)))
        if stop_after is not None and idx >= stop_after:
            break

    logger.info('Scraped {} comments mentioning me'.format(
        len(comment_authors_and_permalinks)))
    return comment_authors_and_permalinks
def get_all_posts_between(start_datetime, end_datetime, steem,
                          stop_after=None):
    """Queries all posts found in blocks between start and end

    Parameters
    ----------
    start_datetime: datetime
    end_datetime: datetime
    steem: Steem
    stop_after: int or None
        For debugging and shorter tests, stop after only a few iterations

    Returns
    -------
    List of dicts of posts

    """
    start_num, block_start_datetime = find_nearest_block_num(start_datetime,
                                                             steem)
    end_num, block_end_datetime = find_nearest_block_num(end_datetime, steem)

    total = end_num - start_num
    posts = []
    logger.info('Querying all posts between '
                '{} (block {}) and {} (block {})'.format(block_start_datetime,
                                                         start_num,
                                                         block_end_datetime,
                                                         end_num))
    exclude_authors_and_permalinks = set()
    for idx, block_num in enumerate(range(start_num, end_num + 1)):
        posts_in_block, authors_and_permalinks = get_all_posts_from_block(
            block_num, steem, exclude_authors_and_permalinks)
        exclude_authors_and_permalinks |= authors_and_permalinks
        posts.extend(posts_in_block)
        if progressbar(idx, total, percentage_step=1, logger=logger):
            logger.info('Finished block {} '
                        '(last is {}) found so far {} '
                        'posts...'.format(block_num, end_num, len(posts)))
        if stop_after is not None and len(posts) >= stop_after:
            break

    logger.info('Scraped {} posts'.format(len(posts)))
    return posts
def get_upvote_payments_for_accounts(accounts, steem, min_datetime,
                                     max_datetime, chunksize=10,
                                     ncores=20, timeout=3600):
    logger.info('Querying upvote purchases between {} and '
                '{} for {} accounts'.format(min_datetime, max_datetime,
                                            len(accounts)))

    # do queries by day!
    start_datetimes = pd.date_range(min_datetime, max_datetime).tolist()
    end_datetimes = start_datetimes[1:] + [max_datetime]

    if ncores > 1:
        chunks = [accounts[irun:irun + chunksize]
                  for irun in range(0, len(accounts), chunksize)]

        ctx = mp.get_context('spawn')
        pool = ctx.Pool(ncores, initializer=tpbg.config_mp_logging)

        async_results = []
        for start_datetime, end_datetime in zip(start_datetimes,
                                                end_datetimes):
            for idx, chunk in enumerate(chunks):
                result = pool.apply_async(_get_upvote_payments_parrallel,
                                          args=(chunk, steem, start_datetime,
                                                end_datetime))
                async_results.append(result)
        pool.close()

        upvote_payments = {}
        terminate = False
        # `async` is a reserved keyword in Python 3.7+, hence `async_result`
        for kdx, async_result in enumerate(async_results):
            try:
                payments = async_result.get(timeout=timeout)
                upvote_payments = extend_upvotes_and_payments(upvote_payments,
                                                              payments)
                if progressbar(kdx, len(async_results), percentage_step=5,
                               logger=logger):
                    logger.info('Finished chunk {} '
                                'out of {} found so far {} '
                                'upvote buyers...'.format(
                                    kdx + 1, len(async_results),
                                    len(upvote_payments)))
            except Exception:
                logger.exception('Something went totally wrong dude!')
                terminate = True
        if terminate:
            logger.error('Terminating pool due to timeout or errors')
            pool.terminate()
        pool.join()
    else:
        return _get_upvote_payments_parrallel(accounts, steem, min_datetime,
                                              max_datetime)

    logger.info('Found {} upvote bought articles'.format(
        len(upvote_payments)))
    return upvote_payments
def test_progressbar():
    result = []
    for irun in range(100):
        result.append(progressbar(irun, 100, percentage_step=1))
    assert all(result)
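# The `progressbar` helper used throughout is not shown in these excerpts.
# A minimal sketch consistent with how it is called above (it returns True
# whenever another `percentage_step` percent of the loop has completed and
# optionally logs) could look like this -- an assumption, not the original
# implementation:
import logging


def progressbar(current, total, percentage_step=10, logger=None):
    """Returns True and optionally logs when `current` crosses a step."""
    step = max(1, int(total * percentage_step / 100))
    if current % step == 0:
        if logger is not None:
            logger.info('Progress: {} %'.format(int(100 * current / total)))
        return True
    return False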