def do_dependency_parse(fil):
    u = TwitterUser()
    u.populate_tweets_from_file(fil,do_tokenize=False)
    out_file_name = fil.replace(".json","").replace(".gz","").replace("/json/","/dep_parse/")
    print out_file_name

    if len(u.tweets) == 0:
        os.utime(out_file_name)
        return 'empty, success'

    data = dependency_parse_tweets(TWEEBOPARSER_LOC,u.tweets,out_file_name)
    return 'completed'
# Exemple #2 (scraping artifact: example separator, vote count 0)
def do_dependency_parse(fil):
    u = TwitterUser()
    u.populate_tweets_from_file(fil, do_tokenize=False)
    out_file_name = fil.replace(".json", "").replace(".gz", "").replace(
        "/json/", "/dep_parse/")
    print out_file_name

    if len(u.tweets) == 0:
        os.utime(out_file_name)
        return 'empty, success'

    data = dependency_parse_tweets(TWEEBOPARSER_LOC, u.tweets, out_file_name)
    return 'completed'
def return_users_from_json_file(file_name,
                                user_id_field='id',
                                only_english=False,
                                min_tweet_count_for_user=5,
                                verbose=True,
                                stopwords=None,
                                return_tweet_json=False):

    if file_name.endswith(".gz"):
        reader = [
            z.decode("utf8") for z in gzip.open(file_name).read().splitlines()
        ]
    else:
        reader = codecs.open(file_name, "r", "utf8")

    users = defaultdict(list)

    n_tweets = 0
    n_non_english = 0

    for line in reader:
        n_tweets += 1

        try:
            tweet = json.loads(line)
        except:
            print 'failed tweet'
            pass
        lang = tweet['lang'] if 'lang' in tweet else langid.classify(
            tweet['text'])[0]
        if not only_english or (only_english and lang == 'en'):
            # ignore the old tweets for now
            users[tweet['user'][user_id_field]].append(tweet)
        else:
            n_non_english += 1

    if not file_name.endswith(".gz"):
        reader.close()

    n_tweets = float(n_tweets)

    if n_tweets == 0 or (only_english and (n_tweets - n_non_english) == 0):
        return []

    good_users = [
        u for u in users.itervalues() if len(u) >= min_tweet_count_for_user
    ]
    twitter_users = [
        TwitterUser(list_of_tweets=u, stopwords=stopwords) for u in good_users
    ]

    if verbose:
        print '\tPercent non english tweets ignored:\t{:0.2f}'.format(
            n_non_english / n_tweets)
        print '\tNum used tweets:\t{0}'.format(n_tweets - n_non_english)
        print '\tN users pre min selection:\t', len(users)

        print '\tN users post min selection:\t', len(twitter_users)
        n_tweets_per_user = [len(u) for u in users.itervalues()]
        #print 'Tweet stats...min: %d max: %d median: %d mean: %d sd: %d' % \
        #(np.min(n_tweets_per_user),
        # np.max(n_tweets_per_user),
        # np.median(n_tweets_per_user),
        # np.mean(n_tweets_per_user),
        # np.std(n_tweets_per_user))

    if not return_tweet_json:
        return twitter_users

    tweet_dict = {}
    if return_tweet_json:
        for u in good_users:
            for t in u:
                tweet_dict[t['id']] = t
    return twitter_users, tweet_dict
# Exemple #4 (scraping artifact: example separator, vote count 0)
print 'n authed users: ', len(handles)

out_dir = sys.argv[2]

user_ids = [line.strip().split(",")[0] for line in open(sys.argv[3]).readlines()]

print 'num users: ', len(user_ids)

net_out = codecs.open("mention_net.csv","w","utf8")
net_out.write("sender,mentioned,date\n")
ht_out = codecs.open("ht_net.csv","w","utf8")
ht_out.write("user,hashtag,date\n")
att_out = codecs.open("att_net.csv","w","utf8")
att_out.write("user,user_name,times_listed,n_followers,n_following\n")
for i in range(len(user_ids)):
    user = TwitterUser(handles[i], screen_name=user_ids[i])
    user.populate_tweets_from_api(sleep_var=False)
    print user.screen_name
    for t in user.tweets:
        datetime = t.created_at.strftime("%Y-%m-%d")
        for m in t.mentions_sns:
            net_out.write(",".join([user.screen_name, m, datetime])+"\n")
        for h in t.hashtags:
            ht_out.write(",".join([user.screen_name,h,datetime])+"\n")
    try:
        att_out.write(",".join([user.screen_name,user.name,str(user.times_listed),str(user.followers_count),str(user.following_count)])+"\n")
    except:
        pass
net_out.close()
ht_out.close()
att_out.close()
"""
A simple example of how to use a single api hook to get tweets for a particular user
"""

import codecs
import sys

from twitter_dm import TwitterApplicationHandler
from twitter_dm import TwitterUser

if len(sys.argv) != 4:
    print 'usage:  [login_credentials_file] [user_screen_name] [output_file]'
    sys.exit(-1)

##get all the handles we have to the api
app_handler = TwitterApplicationHandler(pathToConfigFile=sys.argv[1])
print 'n authed users: ', len(app_handler.api_hooks)

user = TwitterUser(app_handler.api_hooks[0], screen_name=sys.argv[2])

print('\tgetting tweets for: ', sys.argv[2])
user.populate_tweets_from_api(sleep_var=False)

if len(user.tweets) > 0:
    out_fil = codecs.open(sys.argv[3], "w", "utf8")
    for tweet in user.tweets:
        out_fil.write(tweet.text.replace("\n", "   ") + "\n")

out_fil.close()
"""
A simple example of how to use a single api hook to get tweets for a particular user
"""

import sys,codecs
from twitter_dm import TwitterUser
from twitter_dm import TwitterApplicationHandler

if len(sys.argv) != 4:
    print 'usage:  [login_credentials_file] [user_screen_name] [output_file]'
    sys.exit(-1)

##get all the handles we have to the api
app_handler = TwitterApplicationHandler(pathToConfigFile=sys.argv[1])
print 'n authed users: ', len(app_handler.api_hooks)

user = TwitterUser(app_handler.api_hooks[0], screen_name=sys.argv[2])

print('\tgetting tweets for: ',  sys.argv[2])
user.populate_tweets_from_api(sleep_var=False)

if len(user.tweets) > 0:
    out_fil = codecs.open(sys.argv[3], "w","utf8")
    for tweet in user.tweets:
        out_fil.write(tweet.text.replace("\n","   ")+"\n")

out_fil.close()