Ejemplo n.º 1
0
    cursor.execute("SELECT COUNT(*) FROM {}".format(annotations_table))
    return cursor.fetchone()[0]


# Script entry point: parse the command line, make sure the database tables
# for the chosen keyword exist, then walk the tweets in the given file.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Tweet annotator')
    parser.add_argument('tweet_file', help='JSON tweets file for annotation')
    parser.add_argument('keyword', help='Keyword we wish to disambiguate (determines table name and used to filter tweets)')
    parser.add_argument('--skipto', default=None, type=int, help="Skip forwards to this tweet id, continue from the next tweet")
    args = parser.parse_args()
    # Echo the parsed arguments so the operator can confirm what will be run.
    print("These are our args:")
    print(args)
    print(args.tweet_file, args.keyword)

    # The keyword is passed to create_all_tables, which hands back the two
    # table names used by the rest of the script.
    annotations_table, spotlight_table = sql_convenience.create_all_tables(args.keyword)
    # NOTE(review): the file object returned by open() is never bound to a
    # name, so it cannot be closed explicitly -- consider a `with` block.
    tweets = tweet_generators.get_tweets(open(args.tweet_file))

    # We can skip through tweets we've already seen in the same file by
    # specifying a tweet id to jump to.
    # NOTE(review): this only works if `tweets` is a one-shot iterator
    # (presumably a generator), so that the main loop below resumes where
    # this loop stopped -- confirm against tweet_generators.get_tweets.
    # If the id is never found, the iterator is exhausted and the main loop
    # below processes nothing.
    if args.skipto is not None:
        for tweet in tweets:
            if tweet['id'] == args.skipto:
                break  # continue after this tweet

    for tweet in tweets:
        tweet_text = unicode(tweet['text'])
        annotate = True
        # Determine if this is an English tweet or not; the detector is fed
        # the UTF-8 encoded bytes of the tweet text.
        tweet_text_bytesutf8 = tweet_text.encode('utf-8')
        language_name, language_code, is_reliable, text_bytes_found, details = cld.detect(tweet_text_bytesutf8)
        # example: ('SPANISH', 'es', True, 69, [('SPANISH', 'es', 100, 93.45794392523365)])
    # Upper bound of the default date filter: "now", tagged as UTC.
    # NOTE(review): datetime.now() returns local wall-clock time, so stamping
    # it with UTC is only exact on a machine whose local zone is UTC --
    # confirm whether datetime.datetime.now(pytz.utc) was intended.
    filter_to = datetime.datetime.now()
    filter_to = filter_to.replace(tzinfo=pytz.utc)
    filter_to_str = filter_to.isoformat()
    #filter_to_str = time.strftime("%Y-%m-%dT%H:%M", filter_to.timetuple())

    # Command-line interface. The help text for each date option shows the
    # default that will be used when the option is omitted.
    parser = argparse.ArgumentParser(description='Extract information from streaming tweet set')
    parser.add_argument('--json-raw', nargs="*", help='Input to analyse e.g. tweets.json')
    parser.add_argument('--ff', type=str, default=filter_from_str, help="Filter From date range, defaults to '--ff %s'" % (filter_from_str))
    # The help text previously advertised '--ff' here; the option is '--ft'.
    parser.add_argument('--ft', type=str, default=filter_to_str, help="Filter To date range, defaults to '--ft %s'" % (filter_to_str))
    # The example previously named a non-existent '--updates-file' option.
    parser.add_argument('--text-file', help="Filename for just the tweet update text, one per line e.g. '--text-file tweetsonly.txt'")
    parser.add_argument('--output', "-o", help="Output to write e.g. -o coords.txt")
    args = parser.parse_args()

    # Parenthesised single-argument form prints the same text under the
    # print statement and the print function, matching the other prints here.
    print(args.json_raw)

    # Build the processing pipeline: input files -> tweets -> tweet bodies.
    all_json_lines = tweet_generators.files(args.json_raw)
    tweets = tweet_generators.get_tweets(all_json_lines)
    stream = tweet_generators.get_tweet_body(tweets)

    # default will be to look at the last 30 days only
    # Both bounds are re-parsed from their string form (which defaults to the
    # values computed above) and tagged as UTC.
    if args.ff:
        filter_from = dt_parser.parse(args.ff)
        filter_from = filter_from.replace(tzinfo=pytz.utc)
    if args.ft:
        filter_to = dt_parser.parse(args.ft)
        filter_to = filter_to.replace(tzinfo=pytz.utc)
    print("Filtering from {} to {}".format(filter_from, filter_to))

    # Bind the date bounds to filter_until and apply it to the stream.
    filter_until_partial = functools.partial(filter_until, filter_from=filter_from, filter_to=filter_to)
    stream = filter_until_partial(stream)
    if args.output:
Ejemplo n.º 3
0
                        help="Filter To date range, defaults to '--ff %s'" %
                        (filter_to_str))
    # The example previously named a non-existent '--updates-file' option;
    # the option being declared is '--text-file'.
    parser.add_argument(
        '--text-file',
        help=
        "Filename for just the tweet update text, one per line e.g. '--text-file tweetsonly.txt'"
    )
    parser.add_argument('--output',
                        "-o",
                        help="Output to write e.g. -o coords.txt")
    args = parser.parse_args()

    # Parenthesised single-argument form prints the same text under the
    # print statement and the print function, matching the print call below.
    print(args.json_raw)

    # Build the processing pipeline: input files -> tweets -> tweet bodies.
    all_json_lines = tweet_generators.files(args.json_raw)
    tweets = tweet_generators.get_tweets(all_json_lines)
    stream = tweet_generators.get_tweet_body(tweets)

    # default will be to look at the last 30 days only
    # Each supplied bound is parsed from its string form and tagged as UTC.
    if args.ff:
        filter_from = dt_parser.parse(args.ff)
        filter_from = filter_from.replace(tzinfo=pytz.utc)
    if args.ft:
        filter_to = dt_parser.parse(args.ft)
        filter_to = filter_to.replace(tzinfo=pytz.utc)
    print("Filtering from {} to {}".format(filter_from, filter_to))

    # Bind the date bounds to filter_until and apply it to the stream.
    filter_until_partial = functools.partial(filter_until,
                                             filter_from=filter_from,
                                             filter_to=filter_to)
    stream = filter_until_partial(stream)