def _stream_tweets_by_user_id(self, id_, **kwargs):
    # TODO rework this to use min/max tweets instead of assuming < 200
    # means done
    kwargs = dict(
        user_id=id_,
        count=200
    )
    # TODO consider breaking up/refactoring
    while True:
        try:
            l.INFO("Fetching 200 tweets %s" % (kwargs))
            tweets = self.api.GetUserTimeline(**kwargs)
        except Exception as e:
            l.WARN("%s kwargs %s" % (e, kwargs))
            return None
        l.INFO("Streaming tweets")
        for tweet in tweets:
            self.on_tweet(tweet)
        if len(tweets) < 200:
            # TODO Fix - Using < 200 as proxy for end of user timeline
            l.INFO("Stream ended < 200 tweets")
            break
        tweet_ids = [tweet.id for tweet in tweets]
        if len(tweet_ids) > 0:
            # Next request starts just below the oldest tweet in the current
            # batch; max_id is inclusive, so subtract 1 to avoid re-fetching
            # that tweet.
            l.INFO("Setting max ID: {}".format(min(tweet_ids) - 1))
            kwargs['max_id'] = min(tweet_ids) - 1
def on_page_source(self):
    soup = BeautifulSoup(self.page_source, "html.parser")
    elements = soup.findAll("div", {"class": "rg_meta"})
    for element in elements:
        meta_data_str = element.text
        try:
            meta_data = json.loads(meta_data_str)
            self.on_entry(meta_data)
        except Exception as e:
            l.WARN(e)
def cli(ctx, config):
    csv_config = {}
    try:
        with open(config, 'r') as config_file:
            csv_config = json.load(config_file)
        l.INFO("Using custom CSV configuration: %s" % (csv_config))
    except TypeError:
        # open(None) raises TypeError when no config path was supplied;
        # fall back to the defaults.
        l.WARN("Using default CSV configuration: %s" % (CSV_DEFAULT_CONFIG))
    input_ = click.get_text_stream('stdin')
    convert(input_, configuration=csv_config)
def cli(ctx, config):
    s3_config = {}
    try:
        with open(config, 'r') as config_file:
            s3_config = json.load(config_file)
        l.INFO("Using custom S3 configuration: %s" % (s3_config))
    except TypeError:
        # open(None) raises TypeError when no config path was supplied.
        l.WARN("Unable to parse S3 config")
    input_ = click.get_text_stream('stdin')
    convert(input_, configuration=s3_config)
def convert(lines, configuration):
    access_key_id = str(configuration['aws_access_key_id'])
    secret_access_key = str(configuration['aws_secret_access_key'])
    bucket = str(configuration['bucket'])
    cfg_filename = str(configuration.get('output_file', ''))
    s3_client = S3Client(access_key_id, secret_access_key)
    targets = {}
    for line in lines:
        try:
            data = json.loads(line)
        except Exception as e:
            raise Exception(errors.PARSING_ERROR % (line, e))
        if 'type' not in data:
            raise Exception(errors.MISSING_KEY_ERROR % ('type', line))
        data_type = data['type']
        if data_type == 'RECORD':
            if 'stream' not in data:
                raise Exception(errors.MISSING_KEY_ERROR % ('stream', line))
            filename = cfg_filename
            if filename == "":
                filename = data['stream'] + '.json'
            target_path = ('s3://{bucket}/{filename}'.format(
                bucket=bucket, filename=filename))
            record = data['record']
            if target_path not in targets:
                # Open one S3 target per output path and keep its file
                # handle around so later records can be written to it.
                target = S3Target(target_path, client=s3_client)
                targets[target_path] = {
                    'target': target,
                    'file': target.open('w')
                }
            targets[target_path]['file'].write(json.dumps(record) + '\n')
        else:
            l.WARN(errors.UNEXPECTED_MESSAGE_TYPE % (data['type'], data))
    for target_path in targets:
        targets[target_path]['file'].close()
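# Both this S3 target and the CSV target below consume newline-delimited
# messages; each RECORD line must carry a 'stream' name and a 'record'
# payload, and any other message type is logged and skipped. A minimal
# sketch of the expected input, using hypothetical stream and field names:
import json

example_lines = [
    json.dumps({"type": "RECORD", "stream": "users",
                "record": {"id": 1, "name": "Ada"}}),
    json.dumps({"type": "RECORD", "stream": "users",
                "record": {"id": 2, "name": "Grace"}}),
    # A message of another type, which convert() warns about and skips.
    json.dumps({"type": "STATE", "value": {"users": 2}}),
]
# With a 'bucket' of "my-bucket" and no 'output_file' configured, the two
# records above would be written as JSON lines to s3://my-bucket/users.json.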
def update_page_source(self):
    url = self.build_search_url()
    driver = Chrome()
    driver.get(url)
    num_scrolls = 0
    try:
        while num_scrolls < self.scroll_max:
            driver.execute_script(random_js_scroll())
            self.page_source = driver.page_source
            random_sleep()
            num_scrolls += 1
    except Exception as e:
        l.WARN(e)
    driver.close()
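# random_js_scroll and random_sleep are helpers defined elsewhere in the
# project and are not shown in this section. The sketch below is only an
# assumption about their intent (randomised scrolling and pauses so the page
# keeps lazy-loading results), not the project's actual implementation.
import random
import time


def random_js_scroll(min_px=300, max_px=900):
    # Return a JS snippet that scrolls the window down by a random amount.
    return "window.scrollBy(0, {});".format(random.randint(min_px, max_px))


def random_sleep(min_s=0.5, max_s=2.5):
    # Pause for a random interval between scrolls.
    time.sleep(random.uniform(min_s, max_s))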
def convert(lines, configuration):
    cfg_filename = str(configuration.get('output_file', ""))
    delimiter = str(configuration.get('delimiter', ','))
    quotechar = str(configuration.get('quotechar', '"'))
    for line in lines:
        try:
            data = json.loads(line)
        except Exception as e:
            raise Exception(errors.PARSING_ERROR % (line, e))
        if 'type' not in data:
            raise Exception(errors.MISSING_KEY_ERROR % ('type', line))
        data_type = data['type']
        if data_type == 'RECORD':
            if 'stream' not in data:
                raise Exception(errors.MISSING_KEY_ERROR % ('stream', line))
            filename = cfg_filename
            if filename == "":
                filename = data['stream'] + '.csv'
            flattened_record = flatten(data['record'])
            header = flattened_record.keys()
            # newline='' lets the csv module control line endings.
            with open(filename, 'a', newline='') as output_file:
                writer = csv.DictWriter(output_file, header,
                                        extrasaction='ignore',
                                        delimiter=delimiter,
                                        quotechar=quotechar)
                if is_file_empty(filename):
                    # Only write the header once, for the first record.
                    writer.writeheader()
                writer.writerow(flattened_record)
        else:
            l.WARN(errors.UNEXPECTED_MESSAGE_TYPE % (data['type'], data))
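# A quick way to exercise the CSV converter is to feed it RECORD lines
# directly instead of via stdin. The module name target_csv and the
# stream/field names below are assumptions for illustration only.
import json

from target_csv import convert  # hypothetical module name

lines = [
    json.dumps({"type": "RECORD", "stream": "users",
                "record": {"id": 1, "profile": {"name": "Ada"}}}),
    json.dumps({"type": "RECORD", "stream": "users",
                "record": {"id": 2, "profile": {"name": "Grace"}}}),
]

# Nested fields are flattened into composite column names by flatten()
# before being written, so both records end up as rows in users.csv
# (or in the file named by 'output_file', if configured).
convert(lines, configuration={"delimiter": ",", "quotechar": '"'})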
def _stream_tweets(self, user_id=None, screen_name=None, limit=3200):
    # TODO rework this to use min/max tweets instead of assuming < 200
    # means done
    kwargs = dict(
        count=200
    )
    # Pass through whichever identifier was supplied so the request hits
    # the intended timeline rather than the authenticated user's.
    if user_id is not None:
        kwargs['user_id'] = user_id
    if screen_name is not None:
        kwargs['screen_name'] = screen_name
    tweets_gathered = 0
    while True:
        try:
            l.INFO("Fetching 200 tweets %s" % (kwargs))
            tweets = self.api.GetUserTimeline(**kwargs)
            tweets_gathered += len(tweets)
        except Exception as e:
            l.WARN("%s kwargs %s" % (e, kwargs))
            return None
        l.INFO("Streaming tweets")
        for tweet in tweets:
            self.on_tweet(tweet)
        if tweets_gathered >= limit:
            l.INFO("Per user limit hit {} tweets gathered".format(limit))
            break
        if len(tweets) < 200:
            # TODO Fix - Using < 200 as proxy for end of user timeline
            l.INFO("Stream ended < 200 tweets")
            break
        tweet_ids = [tweet.id for tweet in tweets]
        if len(tweet_ids) > 0:
            # max_id is inclusive, so step just below the oldest tweet in
            # the current batch to avoid re-fetching it.
            l.INFO("Setting max ID: {}".format(min(tweet_ids) - 1))
            kwargs['max_id'] = min(tweet_ids) - 1
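# A minimal sketch of wiring the streaming methods above up against
# python-twitter. The TweetStreamer class name, the placeholder credentials,
# and the print-only on_tweet handler are illustrations, not part of the
# project.
import twitter  # python-twitter


class TweetStreamer:
    # _stream_tweets / _stream_tweets_by_user_id from above would live on
    # this class; they only need self.api and self.on_tweet to be set.
    def __init__(self, api, on_tweet):
        self.api = api
        self.on_tweet = on_tweet


api = twitter.Api(consumer_key="...",
                  consumer_secret="...",
                  access_token_key="...",
                  access_token_secret="...",
                  sleep_on_rate_limit=True)

streamer = TweetStreamer(api, on_tweet=lambda t: print(t.id, t.text))
# streamer._stream_tweets(screen_name="example_user", limit=1000)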
def cli(ctx, scroll_max, from_file, from_pipe):
    keywords = []
    if from_file is not None:
        reader = csv.reader(from_file)
        for row in reader:
            keywords.append(row[0])
    if from_pipe:
        try:
            stdin_text = (
                click.get_text_stream('stdin').read().strip()).split('\n')
            for line in stdin_text:
                keywords.append(line)
        except Exception as e:
            raise RuntimeError("Error while reading pipe: %s" % (e))
    if len(keywords) == 0:
        l.WARN("Nothing to search, got: %s" % (keywords))
        return
    crawler = GoogleImageCrawler(task_cls=GoogleImageMetaDataLogger,
                                 queue_data=keywords,
                                 scroll_max=scroll_max)
    crawler.start()
def insert_lines(collection, lines):
    for line in lines:
        try:
            data = json.loads(line)
        except Exception as e:
            raise Exception(PARSING_ERROR % (line, e))
        if 'type' not in data:
            raise Exception(MISSING_KEY_ERROR % ('type', line))
        data_type = data['type']
        if data_type == 'RECORD':
            if 'stream' not in data:
                raise Exception(MISSING_KEY_ERROR % ('stream', line))
            record = data['record']
            collection.insert_one(record)
        else:
            l.WARN("Unexpected message type %s in message %s"
                   % (data['type'], data))
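# A minimal sketch of feeding stdin into insert_lines with pymongo. The
# connection URI, database, and collection names are placeholders.
import sys

from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017/")
collection = client["tap_output"]["records"]

# Each stdin line is expected to be a JSON message; RECORD messages are
# inserted into the collection, anything else is logged and skipped.
insert_lines(collection, sys.stdin)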