import datetime as dt
import logging
import sys
from functools import partial
from multiprocessing import Pool

from numpy import linspace

logger = logging.getLogger(__name__)


def query_tweets(query, limit=None, begindate=dt.date(2006, 3, 21),
                 enddate=dt.date.today(), poolsize=20, lang='',
                 use_proxies=False):
    no_days = (enddate - begindate).days
    if no_days < 0:
        sys.exit('Begin date must occur before end date.')
    if poolsize > no_days:
        # Since we are assigning each pool a range of dates to query,
        # the number of pools should not exceed the number of dates.
        poolsize = no_days
    dateranges = [begindate + dt.timedelta(days=elem)
                  for elem in linspace(0, no_days, poolsize + 1)]

    if limit and poolsize:
        limit_per_pool = (limit // poolsize) + 1
    else:
        limit_per_pool = None

    # With a pool size of 1, pause between requests to avoid an IP ban by
    # Twitter; this is unnecessary when requests are spread across proxies.
    throttled = poolsize == 1 and not use_proxies

    queries = ['{} since:{} until:{}'.format(query, since, until)
               for since, until in zip(dateranges[:-1], dateranges[1:])]

    all_tweets = []
    # Create the pool before the try block so the finally clause never
    # references an unbound name if Pool() itself fails.
    pool = Pool(poolsize)
    logger.info('queries: {}'.format(queries))
    try:
        for new_tweets in pool.imap_unordered(
                partial(query_tweets_once, throttled=throttled,
                        limit=limit_per_pool, lang=lang,
                        use_proxies=use_proxies),
                queries):
            all_tweets.extend(new_tweets)
            logger.info('Got {} tweets ({} new).'.format(
                len(all_tweets), len(new_tweets)))
    except KeyboardInterrupt:
        logger.info('Program interrupted by user. Returning all tweets '
                    'gathered so far.')
    finally:
        pool.close()
        pool.join()

    return all_tweets
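# A minimal usage sketch, assuming query_tweets_once is defined elsewhere in
# this module; the search term, limit, and dates below are illustrative only.
# The __main__ guard matters because multiprocessing re-imports this module
# in its worker processes.
if __name__ == '__main__':
    tweets = query_tweets('pycon', limit=100,
                          begindate=dt.date(2019, 1, 1),
                          enddate=dt.date(2019, 2, 1),
                          poolsize=5)
    print('Collected {} tweets.'.format(len(tweets)))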
from functools import partial
from multiprocessing import Pool


class MultiprocessingDistributor(DistributorBaseClass):
    """
    Distributor using a multiprocessing Pool to calculate the jobs in
    parallel on the local machine.
    """

    def __init__(self, n_workers, disable_progressbar=False,
                 progressbar_title="Feature Extraction", show_warnings=True):
        """
        Creates a new MultiprocessingDistributor instance.

        :param n_workers: how many workers the multiprocessing pool should have.
        :type n_workers: int
        :param disable_progressbar: whether to show a progressbar or not.
        :type disable_progressbar: bool
        :param progressbar_title: the title of the progressbar.
        :type progressbar_title: basestring
        :param show_warnings: whether to show warnings or not.
        :type show_warnings: bool
        """
        self.pool = Pool(processes=n_workers,
                         initializer=initialize_warnings_in_workers,
                         initargs=(show_warnings,))
        self.n_workers = n_workers
        self.disable_progressbar = disable_progressbar
        self.progressbar_title = progressbar_title

    def distribute(self, func, partitioned_chunks, kwargs):
        """
        Calculates the features in a parallel fashion by distributing the
        map command to the process pool.

        :param func: the function to send to each worker.
        :type func: callable
        :param partitioned_chunks: the list of data chunks - each element is
            again a list of chunks - and should be processed by one worker.
        :type partitioned_chunks: iterable
        :param kwargs: parameters for the map function.
        :type kwargs: dict of string to parameter

        :return: the result of the calculation as a list - each item should
            be the result of the application of func to a single element.
        """
        return self.pool.imap_unordered(partial(func, **kwargs),
                                        partitioned_chunks)

    def close(self):
        """
        Collects the results from the workers and closes the process pool.
        """
        self.pool.close()
        self.pool.terminate()
        self.pool.join()
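# A minimal usage sketch, assuming the class's dependencies
# (DistributorBaseClass, initialize_warnings_in_workers) resolve in the
# surrounding package; _times_ten is a hypothetical stand-in for the real
# per-chunk work. Note that distribute() returns a lazy imap_unordered
# iterator, so results must be consumed before close() terminates the pool.
def _times_ten(chunk, offset=0):
    # Toy per-chunk computation; it must live at module level so the worker
    # processes can pickle it.
    return [x * 10 + offset for x in chunk]


if __name__ == '__main__':
    distributor = MultiprocessingDistributor(n_workers=2)
    results = list(distributor.distribute(_times_ten,
                                          [[1, 2], [3, 4], [5, 6]],
                                          {'offset': 1}))
    distributor.close()
    print(results)  # one list per chunk, in completion (not submission) order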
import datetime as dt
import logging
import sys
import time
from functools import partial
from multiprocessing import Pool

from numpy import linspace

logger = logging.getLogger(__name__)


def query_tweets(query, limit=None, begindate=dt.datetime(2006, 3, 21),
                 enddate=dt.datetime.now(), poolsize=20, lang=''):
    # Both bounds must be datetimes (a date cannot be subtracted from a
    # datetime), and total_seconds() is required here: timedelta.seconds
    # only holds the seconds component (0-86399), not the full span.
    no_secs = int((enddate - begindate).total_seconds())
    if no_secs < 0:
        sys.exit('Begin date must occur before end date.')
    if poolsize > no_secs:
        # Since we are assigning each pool a range of timestamps to query,
        # the number of pools should not exceed the number of seconds.
        poolsize = no_secs
    dateranges = [begindate + dt.timedelta(seconds=elem)
                  for elem in linspace(0, no_secs, poolsize + 1)]

    if limit and poolsize:
        limit_per_pool = (limit // poolsize) + 1
    else:
        limit_per_pool = None

    queries = ['{} since_time:{} until_time:{}'.format(
                   query,
                   int(time.mktime(since.timetuple())),
                   int(time.mktime(until.timetuple())))
               for since, until in zip(dateranges[:-1], dateranges[1:])]

    all_tweets = []
    pool = Pool(poolsize)
    logger.info('queries: {}'.format(queries))
    try:
        for new_tweets in pool.imap_unordered(
                partial(query_tweets_once, limit=limit_per_pool, lang=lang),
                queries):
            all_tweets.extend(new_tweets)
            logger.info('Got {} tweets ({} new).'.format(
                len(all_tweets), len(new_tweets)))
    except KeyboardInterrupt:
        logger.info('Program interrupted by user. Returning all tweets '
                    'gathered so far.')
    finally:
        pool.close()
        pool.join()

    return all_tweets
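# A quick, self-contained check of the timestamp-based query format above,
# using a hypothetical one-day window; the exact epoch values depend on the
# local timezone applied by time.mktime.
import datetime as dt
import time

since = dt.datetime(2020, 1, 1)
until = since + dt.timedelta(days=1)
print('{} since_time:{} until_time:{}'.format(
    'pycon',
    int(time.mktime(since.timetuple())),
    int(time.mktime(until.timetuple()))))
# e.g. "pycon since_time:1577833200 until_time:1577919600" on a UTC+1 machine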
import datetime as dt
import logging
from functools import partial
from multiprocessing import Pool

from numpy import linspace

logger = logging.getLogger(__name__)


def query_tweets_parallel(query, limit=None, begindate=dt.date(2006, 3, 21),
                          enddate=dt.date.today(), poolsize=20, lang='',
                          use_proxy=False):
    number_days = (enddate - begindate).days
    if poolsize > number_days:
        # Since we are assigning each pool a range of dates to query,
        # the number of pools should not exceed the number of dates.
        poolsize = number_days
    dateranges = [begindate + dt.timedelta(days=elem)
                  for elem in linspace(0, number_days, poolsize + 1)]

    if limit and poolsize:
        limit_per_pool = (limit // poolsize) + 1
    else:
        limit_per_pool = None

    queries = ['{} since:{} until:{}'.format(query, since, until)
               for since, until in zip(dateranges[:-1], dateranges[1:])]

    all_tweets = []
    pool = Pool(poolsize)
    logger.info('queries: {}'.format(queries))
    try:
        for new_tweets in pool.imap_unordered(
                partial(query_tweets_once, limit=limit_per_pool, lang=lang,
                        use_proxy=use_proxy),
                queries):
            all_tweets.extend(new_tweets)
            logger.info('Got {} tweets ({} new).'.format(
                len(all_tweets), len(new_tweets)))
    except KeyboardInterrupt:
        logger.info('Program interrupted by user. Returning all tweets '
                    'gathered so far.')
    finally:
        pool.close()
        pool.join()

    return all_tweets
import datetime as dt
import sys
from functools import partial
from multiprocessing import Pool

from numpy import linspace


def query_tweets(query, limit=None, begindate=dt.date(2006, 3, 21),
                 enddate=dt.date.today(), poolsize=20, lang=''):
    no_days = (enddate - begindate).days
    if no_days < 0:
        sys.exit('Begin date must occur before end date.')
    if poolsize > no_days:
        # One date range per worker, so the pool cannot be larger than the
        # number of days being covered.
        poolsize = no_days
    dateranges = [begindate + dt.timedelta(days=elem)
                  for elem in linspace(0, no_days, poolsize + 1)]

    if limit and poolsize:
        limit_per_pool = (limit // poolsize) + 1
    else:
        limit_per_pool = None

    queries = ['{} since:{} until:{}'.format(query, since, until)
               for since, until in zip(dateranges[:-1], dateranges[1:])]

    all_tweets = []
    pool = Pool(poolsize)
    try:
        for new_tweets in pool.imap_unordered(
                partial(query_tweets_once, limit=limit_per_pool, lang=lang),
                queries):
            all_tweets.extend(new_tweets)
    finally:
        pool.close()
        pool.join()

    return all_tweets
import datetime as dt
import logging
from multiprocessing import Pool

import numpy as np


def scrape(self, keywords):
    all_tweets = []
    pool_size = 20
    # Scrape a fixed two-week window ending today.
    start_date = dt.date.today() - dt.timedelta(14)
    query = " ".join(keywords)

    no_of_days = (dt.date.today() - start_date).days
    if no_of_days < pool_size:
        # One date range per worker, so the pool cannot be larger than the
        # number of days being covered.
        pool_size = no_of_days
    date_ranges = [start_date + dt.timedelta(days=elem)
                   for elem in np.linspace(0, no_of_days, pool_size + 1)]

    if self.limit and pool_size:
        # Split the overall limit evenly across the workers.
        self.limit = (self.limit // pool_size) + 1

    queries = ["{} since:{} until:{}".format(query, since, until)
               for since, until in zip(date_ranges[:-1], date_ranges[1:])]

    pool = Pool(pool_size)
    logging.info("queries: {}".format(queries))
    try:
        for new_tweets in pool.imap_unordered(self.get_tweets, queries):
            all_tweets.extend(new_tweets)
    except KeyboardInterrupt:
        logging.info("Program interrupted by user. Returning all tweets "
                     "gathered so far.")
    finally:
        pool.close()
        pool.join()

    return all_tweets