def main(time_frame, max_request_per_time_frame, mongo_coll, search_params,
         max_id, termination_function, api_key_path="../api_key.json"):
    """Run a single TwitterCrawler search session against a Mongo collection.

    Args:
        time_frame: Rate-limit window passed to TwitterCrawler.
        max_request_per_time_frame: Max requests allowed per window.
        mongo_coll: Mongo collection the crawler connects to for storage.
        search_params: Search arguments forwarded to set_search_arguments().
        max_id: Upper tweet-id bound for the query (current_max_id).
        termination_function: Callable deciding when search_by_query stops.
        api_key_path: Path to the JSON credentials file (kept as the old
            hard-coded default for backward compatibility).
    """
    crawler = TwitterCrawler(time_frame=time_frame,
                             max_requests=max_request_per_time_frame)
    try:
        crawler.connect(mongo_coll)
        crawler.authenticate(api_key_path)
        crawler.set_search_arguments(search_args=search_params)
        crawler.search_by_query(wait_for=3, current_max_id=max_id,
                                term_func=termination_function)
    finally:
        # Ensure the crawler's resources are released even if any step raises;
        # the original leaked the connection on error.
        crawler.close()
def collect_tweets_by_search_terms(search_configs_filepath, output_folder, config):
    """Crawl tweets for each search config in a never-ending round-robin.

    Loads the search configs from *search_configs_filepath* (JSON), then
    cycles over them forever: for each config it builds an OR-query from the
    lowercased terms, runs a TwitterCrawler search resuming from the stored
    ``since_id``, persists the updated config back to disk via
    flash_cmd_config, and sleeps WAIT_TIME seconds between configs.

    Args:
        search_configs_filepath: Path to the JSON file of search configs.
        output_folder: Folder handed to TwitterCrawler and flash_cmd_config.
        config: Dict whose ``config['apikeys']`` values hold credential sets;
            the last one is used.

    Note: this function never returns (itertools.cycle).
    """
    apikeys = list(config['apikeys'].values()).pop()
    with open(os.path.abspath(search_configs_filepath), 'r') as search_configs_rf:
        search_configs = json.load(search_configs_rf)

    for search_config_id in itertools.cycle(search_configs):
        search_config = search_configs[search_config_id]
        search_terms = [term.lower() for term in search_config['terms']]
        # Terms are OR-ed together, each parenthesized: "(a) OR (b) OR ..."
        querystring = ' OR '.join('(' + term + ')' for term in search_terms)
        # Resume from the last seen tweet id; 0 means "from the beginning".
        since_id = search_config.get('since_id', 0)
        # geocode is stored as a list in JSON; None when absent/empty.
        geocode = tuple(search_config['geocode']) if search_config.get('geocode') else None

        logger.info(
            'REQUEST -> (md5(querystring): [%s]; since_id: [%d]; geocode: [%s])',
            util.md5(querystring.encode('utf-8')), since_id, geocode)

        try:
            twitter_crawler = TwitterCrawler(apikeys=apikeys,
                                             client_args=CLIENT_ARGS,
                                             output_folder=output_folder)
            since_id = twitter_crawler.search_by_query(querystring,
                                                       geocode=geocode,
                                                       since_id=since_id)
        except Exception as exc:
            # Best-effort: log and continue with the next cycle iteration,
            # keeping the previous since_id so no progress is lost.
            logger.error(exc)
            logger.error(util.full_stack())

        # Persist progress so a restart resumes from the newest since_id.
        search_config['since_id'] = since_id
        search_config['querystring'] = querystring
        search_config['geocode'] = geocode
        search_configs[search_config_id] = search_config
        flash_cmd_config(search_configs, search_configs_filepath, output_folder)

        logger.info(
            'COMPLETED -> (md5(querystring): [%s]; since_id: [%d]; geocode: [%s])',
            util.md5(querystring.encode('utf-8')), since_id, geocode)
        logger.info('PAUSE %ds to CONTINUE...' % WAIT_TIME)
        time.sleep(WAIT_TIME)
# NOTE(review): this is a byte-for-byte duplicate definition of
# collect_tweets_by_search_terms that shadows the earlier one at import time.
# One of the two copies should be deleted — confirm which is canonical.
def collect_tweets_by_search_terms(search_configs_filepath, output_folder, config):
    """Crawl tweets for each search config in a never-ending round-robin.

    Loads the search configs from *search_configs_filepath* (JSON), then
    cycles over them forever: for each config it builds an OR-query from the
    lowercased terms, runs a TwitterCrawler search resuming from the stored
    ``since_id``, persists the updated config back to disk via
    flash_cmd_config, and sleeps WAIT_TIME seconds between configs.

    Args:
        search_configs_filepath: Path to the JSON file of search configs.
        output_folder: Folder handed to TwitterCrawler and flash_cmd_config.
        config: Dict whose ``config['apikeys']`` values hold credential sets;
            the last one is used.

    Note: this function never returns (itertools.cycle).
    """
    apikeys = list(config['apikeys'].values()).pop()
    with open(os.path.abspath(search_configs_filepath), 'r') as search_configs_rf:
        search_configs = json.load(search_configs_rf)

    for search_config_id in itertools.cycle(search_configs):
        search_config = search_configs[search_config_id]
        search_terms = [term.lower() for term in search_config['terms']]
        # Terms are OR-ed together, each parenthesized: "(a) OR (b) OR ..."
        querystring = ' OR '.join('(' + term + ')' for term in search_terms)
        # Resume from the last seen tweet id; 0 means "from the beginning".
        since_id = search_config.get('since_id', 0)
        # geocode is stored as a list in JSON; None when absent/empty.
        geocode = tuple(search_config['geocode']) if search_config.get('geocode') else None

        logger.info(
            'REQUEST -> (md5(querystring): [%s]; since_id: [%d]; geocode: [%s])',
            util.md5(querystring.encode('utf-8')), since_id, geocode)

        try:
            twitter_crawler = TwitterCrawler(apikeys=apikeys,
                                             client_args=CLIENT_ARGS,
                                             output_folder=output_folder)
            since_id = twitter_crawler.search_by_query(querystring,
                                                       geocode=geocode,
                                                       since_id=since_id)
        except Exception as exc:
            # Best-effort: log and continue with the next cycle iteration,
            # keeping the previous since_id so no progress is lost.
            logger.error(exc)
            logger.error(util.full_stack())

        # Persist progress so a restart resumes from the newest since_id.
        search_config['since_id'] = since_id
        search_config['querystring'] = querystring
        search_config['geocode'] = geocode
        search_configs[search_config_id] = search_config
        flash_cmd_config(search_configs, search_configs_filepath, output_folder)

        logger.info(
            'COMPLETED -> (md5(querystring): [%s]; since_id: [%d]; geocode: [%s])',
            util.md5(querystring.encode('utf-8')), since_id, geocode)
        logger.info('PAUSE %ds to CONTINUE...' % WAIT_TIME)
        time.sleep(WAIT_TIME)