def extract_csv(df):
    '''Creates a CSV extract in which each row is a Tweet document, using the
    schema in the twarc.json2csv module.

    :param df: Spark DataFrame'''
    column_mapping = make_column_mapping(df.columns, array_fields=['text'])
    # The hashtags and urls fields are handled differently in the Elasticsearch
    # index and in the CSV (per the twarc.json2csv spec), so we need to drop
    # the ES columns before renaming the CSV versions of these columns.
    df = df.drop('hashtags', 'urls')
    for k, v in column_mapping.items():
        # Convert fields stored as arrays
        if v[1]:
            # Concatenate array elements with whitespace
            df = df.withColumn(k, F.concat_ws(' ', df[k]))
        # Rename columns as necessary
        if k != v[0]:
            df = df.withColumnRenamed(k, v[0])
    # Select only the columns identified in json2csv, skipping the user_urls
    # column (which may have been deprecated)
    csv_columns = [c for c in json2csv.get_headings()]
    df_csv = df.select(csv_columns)
    # Remove newlines in the text and user_location fields
    df_csv = df_csv.withColumn('text', F.regexp_replace('text', '\n|\r', ' '))
    df_csv = df_csv.withColumn('user_location',
                               F.regexp_replace('user_location', '\n|\r', ' '))
    # Swap back the date fields so that the created_at field contains the
    # unparsed version
    date_mapping = {'created_at': 'parsed_created_at',
                    'parsed_created_at': 'created_at'}
    df_csv = df_csv.select([F.col(c).alias(date_mapping.get(c, c))
                            for c in df_csv.columns])
    # Get rid of duplicate tweets
    df_csv = df_csv.dropDuplicates(['id'])
    # When writing, the caller should set the CSV escape character to the
    # double quote; otherwise, it causes problems for applications reading
    # the CSV.
    return df_csv
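# A minimal sketch of writing the extract above, assuming a SparkSession
# `spark` and an `output_path` (both hypothetical here, not part of this
# module). Setting the CSV escape character to the double quote avoids the
# read problems noted above; Spark's default escape is a backslash.
df_csv = extract_csv(spark.read.json('tweets.json'))
df_csv.write.csv(output_path, header=True, escape='"')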
def on_hit(self, hit, tweet_count):
    # Cycle tweet CSV files
    if tweet_count % self.max_per_file == 0:
        if self.file:
            self.file.close()
        self.file = open(
            os.path.join(
                self.dataset_path,
                'tweets-{}.csv'.format(str(self.file_count).zfill(3))),
            'w')
        self.sheet = csv.writer(self.file)
        self.sheet.writerow(json2csv.get_headings())
        self.file_count += 1
    # Write to tweet file
    self.sheet.writerow(json2csv.get_row(json.loads(hit.tweet), excel=True))
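# `on_hit` assumes its enclosing class maintains the attributes below. A
# minimal sketch of that contract; the class name and the max_per_file
# default are assumptions, not part of the original code.
class TweetCsvWriter:
    def __init__(self, dataset_path, max_per_file=250000):
        self.dataset_path = dataset_path  # directory for the CSV files
        self.max_per_file = max_per_file  # tweets per CSV before rotating
        self.file = None                  # current open file handle
        self.file_count = 0               # suffix for the next file name
        self.sheet = None                 # csv.writer for the current file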
def make_column_mapping(df_columns, array_fields):
    '''Creates a mapping from TweetSets fields to CSV column headings, using
    headings derived from twarc.json2csv.

    Each key is a column name in the DataFrame created from Tweet JSON by the
    SQL transform; each value is a tuple: the first element is the name of the
    CSV column heading, the second element is a Boolean flag indicating
    whether this field is an array. (Arrays need to be transformed to strings
    prior to writing to CSV.)

    :param df_columns: list of columns in the transformed Spark DataFrame
                       (includes some fields required by json2csv that are
                       not indexed in Elasticsearch)
    :param array_fields: list of fields in df_columns stored as arrays'''
    # Map TweetSets fields to their CSV column names
    column_mapping = {'retweet_quoted_status_id': 'retweet_or_quote_id',
                      'retweeted_quoted_screen_name': 'retweet_or_quote_screen_name',
                      'tweet_id': 'id',
                      'user_follower_count': 'user_followers_count',
                      'language': 'lang',
                      'retweeted_quoted_user_id': 'retweet_or_quote_user_id',
                      'hashtags_csv': 'hashtags',
                      'urls_csv': 'urls'}
    # Add remaining fields from the DataFrame if they are used by json2csv
    column_mapping.update({k: k for k in df_columns
                           if k in json2csv.get_headings()})
    # Set the array flag for those fields that require it
    column_mapping = {k: (v, k in array_fields)
                      for k, v in column_mapping.items()}
    return column_mapping
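# Illustrative call, assuming twarc.json2csv is importable; the column names
# are examples drawn from the mapping above, not a real DataFrame schema.
mapping = make_column_mapping(
    ['tweet_id', 'text', 'user_follower_count', 'hashtags_csv'],
    array_fields=['text'])
for df_column, (csv_heading, is_array) in mapping.items():
    # e.g. tweet_id -> id, text -> text (array)
    print(df_column, '->', csv_heading, '(array)' if is_array else '')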
def main():
    parser = get_argparser()
    args = parser.parse_args()

    command = args.command
    query = args.query or ""

    logging.basicConfig(
        filename=args.log,
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(message)s"
    )

    # log and stop when the process receives SIGINT
    def stop(signal, frame):
        log.warning('process received SIGINT, stopping')
        sys.exit(0)
    signal.signal(signal.SIGINT, stop)

    if command == "version":
        print("twarc v%s" % __version__)
        sys.exit()
    elif command == "help" or not command:
        parser.print_help()
        print("\nPlease use one of the following commands:\n")
        for cmd in commands:
            print(" - %s" % cmd)
        print("\nFor example:\n\n    twarc search blacklivesmatter")
        sys.exit(1)

    # Don't validate the keys if the command is "configure"
    if command == "configure" or args.skip_key_validation:
        validate_keys = False
    else:
        validate_keys = True

    t = Twarc(
        consumer_key=args.consumer_key,
        consumer_secret=args.consumer_secret,
        access_token=args.access_token,
        access_token_secret=args.access_token_secret,
        connection_errors=args.connection_errors,
        http_errors=args.http_errors,
        config=args.config,
        profile=args.profile,
        tweet_mode=args.tweet_mode,
        protected=args.protected,
        validate_keys=validate_keys,
        app_auth=args.app_auth,
        gnip_auth=args.gnip_auth
    )

    # calls that return tweets
    if command == "search":
        if len(args.lang) > 0:
            lang = args.lang[0]
        else:
            lang = None

        # if not using a premium endpoint, do a standard search
        if not args.thirtyday and not args.fullarchive \
                and not args.gnip_fullarchive:
            things = t.search(
                query,
                since_id=args.since_id,
                max_id=args.max_id,
                lang=lang,
                result_type=args.result_type,
                geocode=args.geocode
            )
        else:
            # parse the dates if given
            from_date = parse_dt(args.from_date) if args.from_date else None
            to_date = parse_dt(args.to_date) if args.to_date else None

            if args.gnip_fullarchive:
                env = args.gnip_fullarchive
                product = 'gnip_fullarchive'
            elif args.thirtyday:
                env = args.thirtyday
                product = '30day'
            else:
                env = args.fullarchive
                product = 'fullarchive'

            things = t.premium_search(
                query,
                product,
                env,
                from_date=from_date,
                to_date=to_date,
                sandbox=args.sandbox,
                limit=args.limit,
            )

    elif command == "filter":
        things = t.filter(
            track=query,
            follow=args.follow,
            locations=args.locations,
            lang=args.lang
        )

    elif command == "dehydrate":
        input_iterator = fileinput.FileInput(
            query,
            mode='r',
            openhook=fileinput.hook_compressed,
        )
        things = t.dehydrate(input_iterator)

    elif command == "hydrate":
        input_iterator = fileinput.FileInput(
            query,
            mode='r',
            openhook=fileinput.hook_compressed,
        )
        things = t.hydrate(input_iterator)

    elif command == "tweet":
        things = [t.tweet(query)]

    elif command == "sample":
        things = t.sample()

    elif command == "timeline":
        kwargs = {"max_id": args.max_id, "since_id": args.since_id}
        if re.match('^[0-9]+$', query):
            kwargs["user_id"] = query
        elif query:
            kwargs["screen_name"] = query
        things = t.timeline(**kwargs)

    elif command == "retweets":
        if os.path.isfile(query):
            iterator = fileinput.FileInput(
                query,
                mode='r',
                openhook=fileinput.hook_compressed,
            )
            things = t.retweets(tweet_ids=iterator)
        else:
            things = t.retweets(tweet_ids=query.split(','))

    elif command == "users":
        if os.path.isfile(query):
            iterator = fileinput.FileInput(
                query,
                mode='r',
                openhook=fileinput.hook_compressed,
            )
            # Peek at the file to decide whether it holds ids or screen names
            if re.match('^[0-9,]+$', next(open(query))):
                id_type = 'user_id'
            else:
                id_type = 'screen_name'
            things = t.user_lookup(ids=iterator, id_type=id_type)
        elif re.match('^[0-9,]+$', query):
            things = t.user_lookup(ids=query.split(","))
        else:
            things = t.user_lookup(ids=query.split(","), id_type='screen_name')

    elif command == "followers":
        things = t.follower_ids(query)

    elif command == "friends":
        things = t.friend_ids(query)

    elif command == "trends":
        # look up the WOE ID for a geo-coordinate if appropriate
        geo = re.match(r'^([0-9\-\.]+),([0-9\-\.]+)$', query)
        if geo:
            lat, lon = map(float, geo.groups())
            if lat > 180 or lat < -180 or lon > 180 or lon < -180:
                parser.error('LAT and LONG must be within [-180.0, 180.0]')
            places = list(t.trends_closest(lat, lon))
            if len(places) == 0:
                parser.error("Couldn't find WOE ID for %s" % query)
            query = places[0]["woeid"]

        if not query:
            things = t.trends_available()
        else:
            trends = t.trends_place(query)
            if trends:
                things = trends[0]['trends']

    elif command == "replies":
        tweet = t.tweet(query)
        if not tweet:
            parser.error("tweet with id %s does not exist" % query)
        things = t.replies(tweet, args.recursive)

    elif command == "listmembers":
        list_parts = re.match('^https://twitter.com/(.+)/lists/(.+)$', query)
        if not list_parts:
            parser.error(
                "provide the url for the list, e.g., "
                "https://twitter.com/USAFacts/lists/us-armed-forces"
            )
        things = t.list_members(slug=list_parts.group(2),
                                owner_screen_name=list_parts.group(1))

    elif command == "configure":
        t.configure()
        sys.exit()

    else:
        parser.print_help()
        print("\nPlease use one of the following commands:\n")
        for cmd in commands:
            print(" - %s" % cmd)
        print("\nFor example:\n\n    twarc search blacklivesmatter")
        sys.exit(1)

    # get the output filehandle
    if args.output:
        if pyv == 3:
            fh = codecs.open(args.output, 'wb', 'utf8')
        else:
            fh = open(args.output, 'w')
    else:
        fh = sys.stdout

    # optionally create a csv writer
    csv_writer = None
    if args.format in ("csv", "csv-excel") and command not in [
            "filter", "hydrate", "replies", "retweets", "sample",
            "search", "timeline", "tweet"]:
        parser.error("csv output not available for %s" % command)
    elif args.format in ("csv", "csv-excel"):
        csv_writer = csv.writer(fh)
        csv_writer.writerow(get_headings())

    line_count = 0
    file_count = 0
    for thing in things:

        # rotate the files if necessary
        if args.output and args.split and line_count % args.split == 0:
            file_count += 1
            fh = codecs.open(numbered_filepath(args.output, file_count),
                             'wb', 'utf8')
            if csv_writer:
                csv_writer = csv.writer(fh)
                csv_writer.writerow(get_headings())
        line_count += 1

        # ready to output
        kind_of = type(thing)
        if kind_of == str_type:
            # user or tweet IDs
            print(thing, file=fh)
            log.info("archived %s" % thing)

        elif 'id_str' in thing:
            # tweets and users
            if args.format == "json":
                print(json.dumps(thing), file=fh)
            elif args.format == "csv":
                csv_writer.writerow(get_row(thing))
            elif args.format == "csv-excel":
                csv_writer.writerow(get_row(thing, excel=True))
            log.info("archived %s", thing['id_str'])

        elif 'woeid' in thing:
            # places
            print(json.dumps(thing), file=fh)

        elif 'tweet_volume' in thing:
            # trends
            print(json.dumps(thing), file=fh)

        elif 'limit' in thing:
            # rate limits
            t = datetime.datetime.utcfromtimestamp(
                float(thing['limit']['timestamp_ms']) / 1000)
            t = t.isoformat("T") + "Z"
            log.warning("%s tweets undelivered at %s",
                        thing['limit']['track'], t)
            if args.warnings:
                print(json.dumps(thing), file=fh)

        elif 'warning' in thing:
            # other warnings
            log.warning(thing['warning']['message'])
            if args.warnings:
                print(json.dumps(thing), file=fh)

        elif 'data' in thing:
            # Labs-style JSON schema
            print(json.dumps(thing), file=fh)
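# `numbered_filepath` is not shown in this section. A plausible sketch of the
# behavior the rotation loop above relies on -- an assumption, not the actual
# twarc helper: insert a zero-padded count before the file extension.
import os

def numbered_filepath(filepath, num):
    path, ext = os.path.splitext(filepath)
    # e.g. ('tweets.jsonl', 1) -> 'tweets-001.jsonl'
    return '{}-{:03d}{}'.format(path, num, ext)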
def main():
    parser = get_argparser()
    args = parser.parse_args()

    command = args.command
    query = args.query or ""

    logging.basicConfig(
        filename=args.log,
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(message)s"
    )

    # catch ctrl-c so users don't see a stack trace
    signal.signal(signal.SIGINT, lambda signal, frame: sys.exit(0))

    if command == "version":
        print("twarc v%s" % __version__)
        sys.exit()
    elif command == "help" or not command:
        parser.print_help()
        print("\nPlease use one of the following commands:\n")
        for cmd in commands:
            print(" - %s" % cmd)
        print("\nFor example:\n\n    twarc search blacklivesmatter")
        sys.exit(1)

    # Don't validate the keys if the command is "configure"
    if command == "configure" or args.skip_key_validation:
        validate_keys = False
    else:
        validate_keys = True

    t = Twarc(
        consumer_key=args.consumer_key,
        consumer_secret=args.consumer_secret,
        access_token=args.access_token,
        access_token_secret=args.access_token_secret,
        connection_errors=args.connection_errors,
        http_errors=args.http_errors,
        config=args.config,
        profile=args.profile,
        tweet_mode=args.tweet_mode,
        protected=args.protected,
        validate_keys=validate_keys,
    )

    # calls that return tweets
    if command == "search":
        things = t.search(
            query,
            since_id=args.since_id,
            max_id=args.max_id,
            lang=args.lang,
            result_type=args.result_type,
            geocode=args.geocode
        )
    elif command == "filter":
        things = t.filter(
            track=query,
            follow=args.follow,
            locations=args.locations
        )
    elif command == "dehydrate":
        input_iterator = fileinput.FileInput(
            query,
            mode='r',
            openhook=fileinput.hook_compressed,
        )
        things = t.dehydrate(input_iterator)
    elif command == "hydrate":
        input_iterator = fileinput.FileInput(
            query,
            mode='r',
            openhook=fileinput.hook_compressed,
        )
        things = t.hydrate(input_iterator)
    elif command == "tweet":
        things = [t.tweet(query)]
    elif command == "sample":
        things = t.sample()
    elif command == "timeline":
        kwargs = {"max_id": args.max_id, "since_id": args.since_id}
        if re.match('^[0-9]+$', query):
            kwargs["user_id"] = query
        else:
            kwargs["screen_name"] = query
        things = t.timeline(**kwargs)
    elif command == "retweets":
        things = t.retweets(query)
    elif command == "users":
        if os.path.isfile(query):
            iterator = fileinput.FileInput(
                query,
                mode='r',
                openhook=fileinput.hook_compressed,
            )
            # Peek at the file to decide whether it holds ids or screen names
            if re.match('^[0-9,]+$', next(open(query))):
                id_type = 'user_id'
            else:
                id_type = 'screen_name'
            things = t.user_lookup(ids=iterator, id_type=id_type)
        elif re.match('^[0-9,]+$', query):
            things = t.user_lookup(ids=query.split(","))
        else:
            things = t.user_lookup(ids=query.split(","), id_type='screen_name')
    elif command == "followers":
        things = t.follower_ids(query)
    elif command == "friends":
        things = t.friend_ids(query)
    elif command == "trends":
        # look up the WOE ID for a geo-coordinate if appropriate
        geo = re.match(r'^([0-9\-\.]+),([0-9\-\.]+)$', query)
        if geo:
            lat, lon = map(float, geo.groups())
            if lat > 180 or lat < -180 or lon > 180 or lon < -180:
                parser.error('LAT and LONG must be within [-180.0, 180.0]')
            places = list(t.trends_closest(lat, lon))
            if len(places) == 0:
                parser.error("Couldn't find WOE ID for %s" % query)
            query = places[0]["woeid"]
        if not query:
            things = t.trends_available()
        else:
            trends = t.trends_place(query)
            if trends:
                things = trends[0]['trends']
    elif command == "replies":
        tweet = t.tweet(query)
        if not tweet:
            parser.error("tweet with id %s does not exist" % query)
        things = t.replies(tweet, args.recursive)
    elif command == "listmembers":
        list_parts = re.match('^https://twitter.com/(.+)/lists/(.+)$', query)
        if not list_parts:
            parser.error(
                "provide the url for the list, e.g., "
                "https://twitter.com/USAFacts/lists/us-armed-forces"
            )
        things = t.list_members(slug=list_parts.group(2),
                                owner_screen_name=list_parts.group(1))
    elif command == "configure":
        t.configure()
        sys.exit()
    else:
        parser.print_help()
        print("\nPlease use one of the following commands:\n")
        for cmd in commands:
            print(" - %s" % cmd)
        print("\nFor example:\n\n    twarc search blacklivesmatter")
        sys.exit(1)

    # get the output filehandle
    if args.output:
        if pyv == 3:
            fh = codecs.open(args.output, 'wb', 'utf8')
        else:
            fh = open(args.output, 'w')
    else:
        fh = sys.stdout

    # optionally create a csv writer
    csv_writer = None
    if args.format in ("csv", "csv-excel") and command not in [
            "filter", "hydrate", "replies", "retweets", "sample",
            "search", "timeline", "tweet"]:
        parser.error("csv output not available for %s" % command)
    elif args.format in ("csv", "csv-excel"):
        csv_writer = csv.writer(fh)
        csv_writer.writerow(get_headings())

    line_count = 0
    file_count = 0
    for thing in things:

        # rotate the files if necessary
        if args.output and args.split and line_count % args.split == 0:
            file_count += 1
            fh = codecs.open(numbered_filepath(args.output, file_count),
                             'wb', 'utf8')
            if csv_writer:
                csv_writer = csv.writer(fh)
                csv_writer.writerow(get_headings())
        line_count += 1

        # ready to output
        kind_of = type(thing)
        if kind_of == str_type:
            # user or tweet IDs
            print(thing, file=fh)
            logging.info("archived %s" % thing)
        elif 'id_str' in thing:
            # tweets and users
            if args.format == "json":
                print(json.dumps(thing), file=fh)
            elif args.format == "csv":
                csv_writer.writerow(get_row(thing))
            elif args.format == "csv-excel":
                csv_writer.writerow(get_row(thing, excel=True))
            logging.info("archived %s", thing['id_str'])
        elif 'woeid' in thing:
            # places
            print(json.dumps(thing), file=fh)
        elif 'tweet_volume' in thing:
            # trends
            print(json.dumps(thing), file=fh)
        elif 'limit' in thing:
            # rate limits
            t = datetime.datetime.utcfromtimestamp(
                float(thing['limit']['timestamp_ms']) / 1000)
            t = t.isoformat("T") + "Z"
            logging.warning("%s tweets undelivered at %s",
                            thing['limit']['track'], t)
            if args.warnings:
                print(json.dumps(thing), file=fh)
        elif 'warning' in thing:
            # other warnings
            logging.warning(thing['warning']['message'])
            if args.warnings:
                print(json.dumps(thing), file=fh)
def get_headings(extra_headings=None):
    fields = json2csv.get_headings()
    if extra_headings:
        fields.extend(extra_headings)
    return fields
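# Usage example: append a hypothetical provenance column to the standard
# twarc.json2csv headings before writing rows that carry an extra field.
headings = get_headings(extra_headings=['dataset_id'])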
def main():
    parser = get_argparser()
    args = parser.parse_args()

    command = args.command
    query = args.query or ""

    logging.basicConfig(
        filename=args.log,
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(message)s"
    )

    if command == "version":
        print("twarc v%s" % __version__)
        sys.exit()
    elif command == "help" or not command:
        parser.print_help()
        print("\nPlease use one of the following commands:\n")
        for cmd in commands:
            print(" - %s" % cmd)
        print("\nFor example:\n\n    twarc search blacklivesmatter")
        sys.exit(1)

    t = Twarc(
        consumer_key=args.consumer_key,
        consumer_secret=args.consumer_secret,
        access_token=args.access_token,
        access_token_secret=args.access_token_secret,
        connection_errors=args.connection_errors,
        http_errors=args.http_errors,
        config=args.config,
        profile=args.profile,
        tweet_mode=args.tweet_mode
    )

    # calls that return tweets
    if command == "search":
        things = t.search(
            query,
            since_id=args.since_id,
            max_id=args.max_id,
            lang=args.lang,
            result_type=args.result_type,
            geocode=args.geocode
        )
    elif command == "filter":
        things = t.filter(
            track=query,
            follow=args.follow,
            locations=args.locations
        )
    elif command == "dehydrate":
        input_iterator = fileinput.FileInput(
            query,
            mode='rU',
            openhook=fileinput.hook_compressed,
        )
        things = t.dehydrate(input_iterator)
    elif command == "hydrate":
        input_iterator = fileinput.FileInput(
            query,
            mode='rU',
            openhook=fileinput.hook_compressed,
        )
        things = t.hydrate(input_iterator)
    elif command == "tweet":
        things = [t.tweet(query)]
    elif command == "sample":
        things = t.sample()
    elif command == "timeline":
        kwargs = {"max_id": args.max_id, "since_id": args.since_id}
        if re.match('^[0-9]+$', query):
            kwargs["user_id"] = query
        else:
            kwargs["screen_name"] = query
        things = t.timeline(**kwargs)
    elif command == "retweets":
        things = t.retweets(query)
    elif command == "users":
        if os.path.isfile(query):
            iterator = fileinput.FileInput(
                query,
                mode='rU',
                openhook=fileinput.hook_compressed,
            )
            things = t.user_lookup(iterator=iterator)
        elif re.match('^[0-9,]+$', query):
            things = t.user_lookup(user_ids=query.split(","))
        else:
            things = t.user_lookup(screen_names=query.split(","))
    elif command == "followers":
        things = t.follower_ids(query)
    elif command == "friends":
        things = t.friend_ids(query)
    elif command == "trends":
        # look up the WOE ID for a geo-coordinate if appropriate
        geo = re.match(r'^([0-9\-\.]+),([0-9\-\.]+)$', query)
        if geo:
            lat, lon = map(float, geo.groups())
            if lat > 180 or lat < -180 or lon > 180 or lon < -180:
                parser.error('LAT and LONG must be within [-180.0, 180.0]')
            places = list(t.trends_closest(lat, lon))
            if len(places) == 0:
                parser.error("Couldn't find WOE ID for %s" % query)
            query = places[0]["woeid"]
        if not query:
            things = t.trends_available()
        else:
            trends = t.trends_place(query)
            if trends:
                things = trends[0]['trends']
    elif command == "replies":
        tweet = t.tweet(query)
        if not tweet:
            parser.error("tweet with id %s does not exist" % query)
        things = t.replies(tweet, args.recursive)
    elif command == "configure":
        t.input_keys()
        sys.exit()
    else:
        parser.print_help()
        print("\nPlease use one of the following commands:\n")
        for cmd in commands:
            print(" - %s" % cmd)
        print("\nFor example:\n\n    twarc search blacklivesmatter")
        sys.exit(1)

    # get the output filehandle
    if args.output:
        fh = codecs.open(args.output, 'wb', 'utf8')
    else:
        fh = sys.stdout

    # optionally create a csv writer
    csv_writer = None
    if args.format == "csv" and command not in [
            "filter", "hydrate", "replies", "retweets", "sample",
            "search", "timeline", "tweet"]:
        parser.error("csv output not available for %s" % command)
    elif args.format == "csv":
        csv_writer = csv.writer(fh)
        csv_writer.writerow(get_headings())

    line_count = 0
    file_count = 0
    for thing in things:

        # rotate the files if necessary
        if args.output and args.split and line_count % args.split == 0:
            file_count += 1
            fh = codecs.open(numbered_filepath(args.output, file_count),
                             'wb', 'utf8')
            if csv_writer:
                csv_writer = csv.writer(fh)
                csv_writer.writerow(get_headings())
        line_count += 1

        # ready to output
        kind_of = type(thing)
        if kind_of == str_type:
            # user or tweet IDs
            print(thing, file=fh)
            logging.info("archived %s" % thing)
        elif 'id_str' in thing:
            # tweets and users
            if args.format == "json":
                print(json.dumps(thing), file=fh)
            elif args.format == "csv":
                csv_writer.writerow(get_row(thing))
            logging.info("archived %s", thing['id_str'])
        elif 'woeid' in thing:
            # places
            print(json.dumps(thing), file=fh)
        elif 'tweet_volume' in thing:
            # trends
            print(json.dumps(thing), file=fh)
        elif 'limit' in thing:
            # rate limits
            t = datetime.datetime.utcfromtimestamp(
                float(thing['limit']['timestamp_ms']) / 1000)
            t = t.isoformat("T") + "Z"
            logging.warning("%s tweets undelivered at %s",
                            thing['limit']['track'], t)
            if args.warnings:
                print(json.dumps(thing), file=fh)
        elif 'warning' in thing:
            # other warnings
            logging.warning(thing['warning']['message'])
            if args.warnings:
                print(json.dumps(thing), file=fh)
def main(warc_file):
    twitter = Twarc()
    out = csv.writer(sys.stdout)
    out.writerow(json2csv.get_headings())
    for tweet in twitter.hydrate(tweet_ids(warc_file)):
        out.writerow(json2csv.get_row(tweet))
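# `tweet_ids` is not defined in this section. A rough, hypothetical sketch of
# what such a helper might do -- scan a (possibly gzipped) WARC file for tweet
# status URLs and yield the ids -- under the assumption that the real helper
# parses WARC records more carefully.
import gzip
import re

STATUS_URL = re.compile(r'twitter\.com/[^/]+/status(?:es)?/(\d+)')

def tweet_ids(warc_file):
    opener = gzip.open if warc_file.endswith('.gz') else open
    seen = set()
    with opener(warc_file, 'rt', errors='ignore') as f:
        for line in f:
            for match in STATUS_URL.finditer(line):
                tweet_id = match.group(1)
                if tweet_id not in seen:
                    seen.add(tweet_id)
                    yield tweet_id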