cursor.execute("SELECT COUNT(*) FROM {}".format(annotations_table))
# COUNT(*) yields a single row with a single column; return that number.
return cursor.fetchone()[0]


# Command-line entry point: parse the arguments, create the tables for the
# chosen keyword, then walk the tweets in the given file, running language
# detection on each tweet's text.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Tweet annotator')
    parser.add_argument('tweet_file', help='JSON tweets file for annotation')
    parser.add_argument('keyword', help='Keyword we wish to disambiguate (determines table name and used to filter tweets)')
    parser.add_argument('--skipto', default=None, type=int, help="Skip forwards to this tweet id, continue from the next tweet")
    args = parser.parse_args()
    print("These are our args:")
    print(args)
    # NOTE(review): under Python 2 without `from __future__ import
    # print_function` this prints a 2-tuple rather than two values -- confirm.
    print(args.tweet_file, args.keyword)

    # create_all_tables returns a pair that is unpacked into the two table
    # names; presumably both are derived from the keyword -- see
    # sql_convenience to confirm.
    annotations_table, spotlight_table = sql_convenience.create_all_tables(args.keyword)
    # NOTE(review): the file object handed to get_tweets is never explicitly
    # closed in the code visible here.
    tweets = tweet_generators.get_tweets(open(args.tweet_file))

    # we can skip through Tweets we've already seen in the same file by
    # specifying a tweet id to jump to
    # NOTE(review): this relies on `tweets` being a single-pass iterator, so
    # that the main loop below resumes after the matching tweet -- presumably
    # get_tweets is a generator; confirm. If the id never matches, the skip
    # loop consumes every tweet and the main loop then has nothing to process.
    if args.skipto is not None:
        for tweet in tweets:
            if tweet['id'] == args.skipto:
                break  # continue after this tweet

    for tweet in tweets:
        tweet_text = unicode(tweet['text'])
        annotate = True
        # determine if this is an English tweet or not
        # cld.detect is given UTF-8 encoded bytes rather than the unicode text
        tweet_text_bytesutf8 = tweet_text.encode('utf-8')
        language_name, language_code, is_reliable, text_bytes_found, details = cld.detect(tweet_text_bytesutf8)
        # example: ('SPANISH', 'es', True, 69, [('SPANISH', 'es', 100, 93.45794392523365)])
filter_to = datetime.datetime.now() filter_to = filter_to.replace(tzinfo=pytz.utc) filter_to_str = filter_to.isoformat() #filter_to_str = time.strftime("%Y-%m-%dT%H:%M", filter_to.timetuple()) parser = argparse.ArgumentParser(description='Extract information from streaming tweet set') parser.add_argument('--json-raw', nargs="*", help='Input to analyse e.g. tweets.json') parser.add_argument('--ff', type=str, default=filter_from_str, help="Filter From date range, defaults to '--ff %s'" % (filter_from_str)) parser.add_argument('--ft', type=str, default=filter_to_str, help="Filter To date range, defaults to '--ff %s'" % (filter_to_str)) parser.add_argument('--text-file', help="Filename for just the tweet update text, one per line e.g. '--updates-file tweetsonly.txt'") parser.add_argument('--output', "-o", help="Output to write e.g. -o coords.txt") args = parser.parse_args() print args.json_raw all_json_lines = tweet_generators.files(args.json_raw) tweets = tweet_generators.get_tweets(all_json_lines) stream = tweet_generators.get_tweet_body(tweets) # default will be to look at the last 30 days only if args.ff: filter_from = dt_parser.parse(args.ff) filter_from = filter_from.replace(tzinfo=pytz.utc) if args.ft: filter_to = dt_parser.parse(args.ft) filter_to = filter_to.replace(tzinfo=pytz.utc) print("Filtering from {} to {}".format(filter_from, filter_to)) filter_until_partial = functools.partial(filter_until, filter_from=filter_from, filter_to=filter_to) stream = filter_until_partial(stream) if args.output:
help="Filter To date range, defaults to '--ff %s'" % (filter_to_str)) parser.add_argument( '--text-file', help= "Filename for just the tweet update text, one per line e.g. '--updates-file tweetsonly.txt'" ) parser.add_argument('--output', "-o", help="Output to write e.g. -o coords.txt") args = parser.parse_args() print args.json_raw all_json_lines = tweet_generators.files(args.json_raw) tweets = tweet_generators.get_tweets(all_json_lines) stream = tweet_generators.get_tweet_body(tweets) # default will be to look at the last 30 days only if args.ff: filter_from = dt_parser.parse(args.ff) filter_from = filter_from.replace(tzinfo=pytz.utc) if args.ft: filter_to = dt_parser.parse(args.ft) filter_to = filter_to.replace(tzinfo=pytz.utc) print("Filtering from {} to {}".format(filter_from, filter_to)) filter_until_partial = functools.partial(filter_until, filter_from=filter_from, filter_to=filter_to) stream = filter_until_partial(stream)