def gen_json_for_tweets_of_interest(data, identity_list):
    of_id, uid_list = data
    json_of_name = os.path.join(JSON_OUTPUT_DIRECTORY, str(of_id) + ".json.gz")

    print 'inp: ', json_of_name, len(uid_list), uid_list[0:2]
    tweets_to_write = []

    if not os.path.exists(json_of_name):
        for i, uid in enumerate(uid_list):
            if i % 25 == 0:
                print i, len(tweets_to_write)
            try:
                u = TwitterUser()
                u.populate_tweets_from_file(os.path.join(
                    JSON_INPUT_DIRECTORY, uid + ".json.gz"),
                                            store_json=True)
                tweets_to_keep = []
                for t in u.tweets:
                    if not t.retweeted and len(t.tokens) > 4:
                        expanded_token_set = copy(t.tokens)
                        for token in t.tokens:
                            expanded_token_set += get_alternate_wordforms(
                                token)
                        if len(set(expanded_token_set) & identity_list):
                            tweets_to_keep.append(t)
                tweets_to_write += tweets_to_keep
            except:
                print 'FAILED JSON FOR USER: ', uid

        print 'WRITING JSON'
        out_fil = gzip.open(json_of_name, "wb")
        for tweet in tweets_to_write:
            out_fil.write(
                json.dumps(tweet.raw_json).strip().encode("utf8") + "\n")
        out_fil.close()
def gen_conll_file(fil,ptb_dir, dp_dir):
    user = TwitterUser()
    user.populate_tweets_from_file(fil, do_tokenize=False)

    if 50 <= user.n_total_tweets <= 15000 and\
       user.followers_count <= 25000 and user.creation_date <= MIN_ACCOUNT_AGE:

        dp_filename = os.path.join(dp_dir,str(user.user_id)+".gz")
        ptb_filename = os.path.join(ptb_dir,str(user.user_id)+".txt.gz")

        if not os.path.exists(dp_filename) or not os.path.exists(ptb_filename):
            return ['no_dp_ptb',[user.user_id,os.path.exists(dp_filename),os.path.exists(ptb_filename)]]

        penntreebank = {x[0] : x[1:] for x in read_grouped_by_newline_file(ptb_filename)}
        dependency_parse =  read_grouped_by_newline_file(dp_filename)

        tweet_set = [(i,t) for i,t in enumerate(user.tweets) if t.retweeted is None and\
                       len(t.urls) == 0 and 'http:' not in t.text and\
                       langid.classify(t.text)[0] == 'en']

        # non english speaker or spam
        if len(tweet_set) < 40:
            return ['notweets',user.user_id]


        data_to_return = []
        for twit_it, tweet in tweet_set:

            data_for_tweet = []

            ptb_for_tweet = penntreebank[str(tweet.id)]
            dp_for_tweet = dependency_parse[twit_it]

            if ptb_for_tweet[0].split("\t")[2] != DependencyParseObject(dp_for_tweet[0]).text:
                print 'ahhhhh, weird stuff'
                continue

            for i, p in enumerate(dp_for_tweet):
                d = DependencyParseObject(tsn([p,tweet.id,user.user_id,tweet.created_at.strftime("%m-%d-%y")],newline=False))
                # get java features
                spl_java = ptb_for_tweet[i].split("\t")
                java_id, penn_pos_tag,word = spl_java[:3]
                java_features = '' if len(spl_java) == 3 else spl_java[3]
                d.features += [x for x in java_features.split("|") if x != '']
                d.features.append("penn_treebank_pos="+penn_pos_tag)
                data_for_tweet.append(d)
            data_to_return.append(data_for_tweet)

        return ['success', [user.user_id,data_to_return]]
    else:
        return ['baduser',user.user_id]
def gen_output(data, json_data_dir):

    term, is_reply, tweets_needed = data

    dataset = []

    # get all user files
    files = glob.glob(os.path.join(json_data_dir, "*"))
    random.shuffle(files)

    for f in files:
        user = TwitterUser()
        user.populate_tweets_from_file(f,
                                       store_json=True,
                                       do_arabic_stemming=False,
                                       lemmatize=False)

        if 50 <= user.n_total_tweets <= 10000 and\
           user.followers_count <= 25000 and user.creation_date <= MIN_ACCOUNT_AGE:

            tweet_set = [t for t in user.tweets if t.retweeted is None and\
                           len(t.urls) == 0 and 'http:' not in t.text and\
                           len(t.tokens) > 5 and\
                           t.created_at >= MIN_TWEET_DATE and\
                           (term == '' or term in t.tokens) and\
                           langid.classify(t.text)[0] == 'en' and\
                           sentiment(t.text)['compound'] != 0]

            if is_reply:
                tweet_set = [t for t in tweet_set if t.reply_to]
            else:
                tweet_set = [t for t in tweet_set if not t.reply_to]

            if len(tweet_set) == 0:
                print 'size 0', term, tweets_needed, is_reply
                continue

            tweet = random.sample(tweet_set, 1)[0]
            print user.screen_name, term, tweets_needed, is_reply, "::::  ", tweet.text
            dataset.append(tweet)
            tweets_needed -= 1
            if tweets_needed == 0:
                name = term if term != '' else 'random'
                name += '_reply' if is_reply else '_non_reply'
                pickle.dump(dataset, open(name + ".p", 'wb'))
                print 'done with: ', name, is_reply
                return

        else:
            print 'failed user'
Example 4
def gen_json_for_tweets_of_interest(input_filename,
                                    output_filename,
                                    keep_only_tweets_with_terms=None):
    """
    This function generates a cleaned json file so that the identity
    extraction only happens on "interesting" tweets.  Right now,
    interesting is defined as non-retweets that have >4 tokens. Feel free to redefine
    it as you see fit.

    :param input_filename: input json file name (Can be gzipped)
    :param output_filename: cleaned output json filename
    :param keep_only_tweets_with_terms: If you only want to keep tweets containing a specific
            set of terms, you can use this argument and pass in a set of terms here
    :return:
    """
    tweets_to_write = []

    if not os.path.exists(output_filename):
        try:
            u = TwitterUser()
            u.populate_tweets_from_file(input_filename, store_json=True)
            tweets = [
                t for t in u.tweets if not t.retweeted and len(t.tokens) > 4
            ]
            tweets_to_keep = []

            if keep_only_tweets_with_terms:
                for t in tweets:
                    expanded_token_set = copy(t.tokens)
                    for token in t.tokens:
                        expanded_token_set += get_alternate_wordforms(token)
                    if len(
                            set(expanded_token_set)
                            & keep_only_tweets_with_terms):
                        tweets_to_keep.append(t)
            else:
                tweets_to_keep = tweets

            tweets_to_write += tweets_to_keep
        except:
            print 'FAILED TO PARSE JSON FILE: ', input_filename

        print 'WRITING JSON'
        out_fil = gzip.open(output_filename, "wb")
        for tweet in tweets_to_write:
            out_fil.write(
                json.dumps(tweet.raw_json).strip().encode("utf8") + "\n")
        out_fil.close()
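# A minimal usage sketch for the function above, assuming it is importable from
# this module; the input/output paths and the term set below are hypothetical
# illustrations, not part of the original example.
if __name__ == '__main__':
    terms_of_interest = set(['teacher', 'nurse', 'doctor'])  # hypothetical terms
    gen_json_for_tweets_of_interest('raw_tweets/12345.json.gz',
                                    'cleaned_tweets/12345.json.gz',
                                    keep_only_tweets_with_terms=terms_of_interest)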
def gen_output(data, json_data_dir):

    term,is_reply,tweets_needed = data

    dataset = []

    # get all user files
    files = glob.glob(os.path.join(json_data_dir,"*"))
    random.shuffle(files)

    for f in files:
        user = TwitterUser()
        user.populate_tweets_from_file(f,store_json=True,do_arabic_stemming=False,lemmatize=False)

        if 50 <= user.n_total_tweets <= 10000 and\
           user.followers_count <= 25000 and user.creation_date <= MIN_ACCOUNT_AGE:

            tweet_set = [t for t in user.tweets if t.retweeted is None and\
                           len(t.urls) == 0 and 'http:' not in t.text and\
                           len(t.tokens) > 5 and\
                           t.created_at >= MIN_TWEET_DATE and\
                           (term == '' or term in t.tokens) and\
                           langid.classify(t.text)[0] == 'en' and\
                           sentiment(t.text)['compound'] != 0]

            if is_reply:
                tweet_set = [t for t in tweet_set if t.reply_to]
            else:
                tweet_set = [t for t in tweet_set if not t.reply_to]

            if len(tweet_set) == 0:
                print 'size 0', term, tweets_needed, is_reply
                continue

            tweet = random.sample(tweet_set, 1)[0]
            print user.screen_name, term, tweets_needed, is_reply, "::::  ", tweet.text
            dataset.append(tweet)
            tweets_needed -= 1
            if tweets_needed == 0:
                name = term if term != '' else 'random'
                name += '_reply' if is_reply else '_non_reply'
                pickle.dump(dataset,open(name+".p",'wb'))
                print 'done with: ',name, is_reply
                return

        else:
            print 'failed user'
    def get_user_network(self, this_user_network_dir_name, user_ids, restrict_output_to_ids, stored_user_list):
        counter_val = 0
        for uid in user_ids:
            counter_val += 1
            if counter_val % 10 == 0:
                print (counter_val, " / ", len(user_ids), this_user_network_dir_name.replace(self.network_dir, ""))

            # try to find user in stored_users
            if str(uid) in stored_user_list:
                user = pickle.load(open(self.pickle_dir + "/" + str(uid), "rb"))
            else:
                user = TwitterUser(self.api_hook, user_id=uid)
                user.populate_tweets_from_api()
                out_fil = open(self.pickle_dir + "/" + str(uid), "wb")
                pickle.dump(user, out_fil)
                out_fil.close()

            self.write_user_network(this_user_network_dir_name, user, uid, restrict_output_to_ids)
def gen_json_for_tweets_of_interest(input_filename, output_filename,keep_only_tweets_with_terms=None):
    """
    This function generates a cleaned json file so that the identity
    extraction only happens on "interesting" tweets.  Right now,
    interesting is defined as non-retweets that have >4 tokens. Feel free to redefine
    it as you see fit.

    :param input_filename: input json file name (Can be gzipped)
    :param output_filename: cleaned output json filename
    :param keep_only_tweets_with_terms: If you only want to keep tweets containing a specific
            set of terms, you can use this argument and pass in a set of terms here
    :return:
    """
    tweets_to_write = []

    if not os.path.exists(output_filename):
        try:
            u = TwitterUser()
            u.populate_tweets_from_file(input_filename,store_json=True)
            tweets = [t for t in u.tweets if not t.retweeted and len(t.tokens) > 4]
            tweets_to_keep = []

            if keep_only_tweets_with_terms:
                for t in tweets:
                    expanded_token_set = copy(t.tokens)
                    for token in t.tokens:
                        expanded_token_set += get_alternate_wordforms(token)
                    if len(set(expanded_token_set) & keep_only_tweets_with_terms):
                        tweets_to_keep.append(t)
            else:
                tweets_to_keep = tweets


            tweets_to_write += tweets_to_keep
        except:
            print 'FAILED TO PARSE JSON FILE: ', input_filename

        print 'WRITING JSON'
        out_fil = gzip.open(output_filename, "wb")
        for tweet in tweets_to_write:
            out_fil.write(json.dumps(tweet.raw_json).strip().encode("utf8") + "\n")
        out_fil.close()
    def run(self):
        print ("Worker started")

        while True:

            user_id, snow_sample_number = self.queue.get(True)

            print ("Starting: ", user_id, snow_sample_number)

            stored_user_list = set([os.path.basename(user_pickle) for user_pickle in glob.glob(self.pickle_dir + "*")])

            # Get the ego
            if user_id in stored_user_list:
                print ("\tgot pickled: ", user_id)
                user = pickle.load(open(self.pickle_dir + "/" + str(user_id), "rb"))
            else:
                user = TwitterUser(self.api_hook, user_id=user_id)
                print ("\tgetting tweets for: ", user_id)
                user.populate_tweets_from_api()
                print ("\t num tweets received for: ", user_id, " ", len(user.tweets))
                # print '\tgetting followers for: ', screen_name
                # user.populate_followers()

                print ("\tgetting friends for: ", user_id)
                user.populate_friends()

                print ("pickling: ", user_id)
                pickle.dump(user, open(self.pickle_dir + "/" + user_id, "wb"))

            ##write out their following network and add each id to queue
            network_fil = codecs.open(os.path.join(self.network_dir, user_id), "w", "utf-8")
            added = 0
            for following_id in user.friend_ids:
                if snow_sample_number < 2:
                    added += 1
                    self.queue.put([str(following_id), snow_sample_number + 1])
                network_fil.write(",".join([user_id, str(following_id)]) + "\n")
            network_fil.close()

            print "finished collecting data for: ", user_id
            print "added: ", added
Example 9
def getTweets(twitterid):
    '''
    Function to get the twitter data for an individual twitter ID.
    This function is written to work with Kenny's github example here: https://github.com/kennyjoseph/twitter_dm
    
    Input: string of twitterID
    Output: list of the raw string of all tweets for twitterID
    '''
    from twitter_dm.TwitterUser import TwitterUser

    tweets = []
    u = TwitterUser()
    u.populate_tweets_from_file(
        twitterid + '.json'
    )  # Need to figure out if we can use a numeric ID (123456789.json) or a name (kenny_joseph.json)

    for t in u.tweets:
        tweets.append(
            t.tokens
        )  # not sure if tokens is exactly what we want; we want the raw words, not necessarily tokens. We'll check this.
    #
    # texts={}
    # source_filename='Datasets/Twitter/members.zip'
    # parser = etree.XMLParser(encoding='utf8',recover=True)
    # with zipfile.ZipFile(source_filename) as zf:
    #     for i,member in enumerate(zf.infolist()):
    #         name=member.filename.split('/')[1].split('.')[0]    #filename is Raw3/name.csv
    #         if idx ==name:
    #             #print idx, name
    #             raw=zf.open(member)
    #             data=csv.reader(raw)
    #             for j,line in enumerate(data):
    #                 if j>0:
    #                     texts[idx+'_'+str(j)]=line[0]
    # if texts=={}:
    #     print 'no tweets for ', idx

    return tweets
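# A brief, hypothetical usage sketch for getTweets above: the numeric ID is
# illustrative and assumes a file named '123456789.json' exists in the working
# directory.
token_lists = getTweets('123456789')
print 'collected', len(token_lists), 'tweet token lists'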
    def run(self):
        print ("Worker started")

        while True:

            user_id, snow_sample_number = self.queue.get(True)

            print "Starting: ", user_id, snow_sample_number

            stored_user_list = set(
                [os.path.basename(user_pickle) for user_pickle in glob.glob(os.path.join(self.out_dir, "obj", "*"))]
            )

            # Get the ego
            if user_id in stored_user_list:
                print ("\tgot pickled: ", user_id)
                user = pickle.load(open(os.path.join(self.out_dir, "obj", str(user_id)), "rb"))
            else:
                user = TwitterUser(self.api_hook, user_id=user_id)
                user.populate_tweets_from_api(json_output_directory=os.path.join(self.out_dir, "json"))

                if len(user.tweets) == 0:
                    print "pickling and dumping: ", user.screen_name
                    pickle.dump(user, open(os.path.join(self.out_dir, "obj", user_id), "wb"))
                    continue
                print "populating friends, ", user.screen_name
                user.populate_friends()

                print "pickling and dumping (no tweets): ", user.screen_name
                user.tweets = []
                pickle.dump(user, open(os.path.join(self.out_dir, "obj", user_id), "wb"))

            ##write out their following network and add each id to queue
            # network_fil = codecs.open(os.path.join(self.network_dir,user_id),"w", "utf-8")
            added = 0
            for following_id in user.mentioned.keys():
                if snow_sample_number < self.step_count:
                    added += 1
                    self.queue.put([str(following_id), snow_sample_number + 1])
                # network_fil.write(",".join([user_id,str(following_id)])+"\n")
            # network_fil.close()

            print "finished collecting data for: ", user_id
            print "added: ", added
Example 11
# Get the handles to the Twitter API
handles = get_handles(glob.glob(os.path.join(sys.argv[1], "*.txt")))
print 'n authed users: ', len(handles)

out_dir = sys.argv[2]
os.mkdir(out_dir)

#user_sns = [line.strip() for line in open(sys.argv[3]).readlines()]
user_sns = ['Neuro_Skeptic']

print 'num users: ', len(user_sns)

of = codecs.open("output_fil.tsv", "w", "utf8")
for i in range(len(user_sns)):
    #creates a Twitter User object to fill with information from the API
    user = TwitterUser(handles[i], screen_name=user_sns[i])
    user.populate_tweets_from_api(json_output_filename=out_dir + user_sns[i] +
                                  ".json",
                                  sleep_var=False)
    user.populate_followers()
    rts = 0
    gt = 0
    for t in user.tweets:
        if t.retweeted is not None:
            rts += 1
        if t.geocode_info is not None:
            gt += 1

    of.write(
        tab_stringify_newline([
            user.screen_name, gt, rts,
    spl = dataset_descrip.split("=")

    if spl[0] == 'random':
        datasets_to_collect.append(['', int(spl[1]), []])
    else:
        datasets_to_collect.append([spl[0], int(spl[1]), []])

# get all user files
files = glob.glob(os.path.join(json_data_dir, "*"))

curr_dataset = datasets_to_collect[0]

print datasets_to_collect

for f in files:
    user = TwitterUser(filename_for_tweets=f)

    if user.n_total_tweets < 10000 and user.n_total_tweets > 50 and\
        user.followers_count < 25000 and user.creation_date <= MIN_ACCOUNT_AGE:

        tweet_set = [t for t in user.tweets if t.retweeted is None and\
                                                len(t.urls) == 0 and\
                                                len(t.tokens) > 5 and\
                                                t.created_at <= MIN_TWEET_DATE and\
                                                curr_dataset[0] in t.tokens and\
                                                langid.classify(t.text)[0] == 'en' and\
                                                sentiment(t.text)['compound'] != 0]
        if len(tweet_set) == 0:
            continue

        tweet = random.sample(tweet_set, 1)[0]
Example 13
from twitter_dm.TwitterAPIHook import TwitterAPIHook
from twitter_dm.TwitterUser import TwitterUser

username_to_collect_data_for = 'Jackie_Pooo'

consumer_key = "YOUR_CONSUMER_KEY_HERE"
consumer_secret = "YOUR_CONSUMER_SECRET_HERE"
access_token = "YOUR_ACCESS_TOKEN_HERE"
access_token_secret = "YOUR_ACCESS_TOKEN_SECRET_HERE"

## get a "hook", or connection, to the API using your consumer key/secret and access token/secret
api_hook = TwitterAPIHook(consumer_key,consumer_secret,
                          access_token=access_token,access_token_secret=access_token_secret)

#creates a Twitter User object to fill with information from the API
user = TwitterUser(api_hook,screen_name=username_to_collect_data_for)


# we call populate_tweets_from_api, which goes to the Twitter API
# and collects the user's data; it is written to the file username_you_put.json.
# The sleep_var param tells the function it shouldn't worry
# about rate limits (we're only collecting for one user, so it doesn't really matter).
# If you remove the is_gzip argument, the output file will be gzipped.
print "populating user's tweets!"
user.populate_tweets_from_api(json_output_filename=username_to_collect_data_for+".json",
                              sleep_var=False, is_gzip=False, since_id=None)


for t in user.tweets:
    print t.mentions
print 'user had {n_tweets} tweets'.format(n_tweets=len(user.tweets))
# Get the handles to the Twitter API
handles = get_handles(glob.glob(os.path.join(sys.argv[1],"*.txt")))
print 'n authed users: ', len(handles)

out_dir = sys.argv[2]
os.mkdir(out_dir)

#user_sns = [line.strip() for line in open(sys.argv[3]).readlines()]
user_sns = ['Neuro_Skeptic']

print 'num users: ', len(user_sns)

of = codecs.open("output_fil.tsv","w","utf8")
for i in range(len(user_sns)):
    #creates a Twitter User object to fill with information from the API
    user = TwitterUser(handles[i], screen_name=user_sns[i])
    user.populate_tweets_from_api(json_output_filename=out_dir+user_sns[i]+".json",
                                  sleep_var=False)
    user.populate_followers()
    rts = 0
    gt = 0
    for t in user.tweets:
        if t.retweeted is not None:
            rts+=1
        if t.geocode_info is not None:
            gt +=1

    of.write(tab_stringify_newline([user.screen_name,
                                 gt,
                                 rts,
                                len(user.tweets),
from twitter_dm.TwitterAPIHook import TwitterAPIHook

username_to_collect_data_for = '_kenny_joseph'

consumer_key = "YOUR_CONSUMER_KEY_HERE"
consumer_secret = "YOUR_CONSUMER_SECRET_HERE"
access_token = "YOUR_ACCESS_TOKEN_HERE"
access_token_secret = "YOUR_ACCESS_TOKEN_SECRET_HERE"


## get a "hook", or connection, to the API using your consumer key/secret and access token/secret
api_hook = TwitterAPIHook(consumer_key,consumer_secret,
                          access_token=access_token,access_token_secret=access_token_secret)

#creates a Twitter User object to fill with information from the API
user = TwitterUser(api_hook, screen_name=username_to_collect_data_for)


# we call populate_tweets_from_api, which goes to the Twitter API
# and collects the user's data; it is written to the file username_you_put.json.
# The sleep_var param tells the function it shouldn't worry
# about rate limits (we're only collecting for one user, so it doesn't really matter).
# If you remove the is_gzip argument, the output file will be gzipped.
print "populating user's tweets!"
user.populate_tweets_from_api(json_output_filename=username_to_collect_data_for+".json",
                              sleep_var=False, is_gzip=False)
print 'user had {n_tweets} tweets'.format(n_tweets=len(user.tweets))

# we now will collect the user's followers
print 'populating user followers!'
user.populate_followers(sleep_var=False)
Example 16
elif args.screen_name:
    print 'Running with screen name: ', args.screen_name
    args.json_file_or_folder = os.path.join(OUTPUT_DIR,
                                            args.screen_name + ".json.gz")
    if os.path.exists(args.json_file_or_folder):
        print "User's tweets already in the system at: ", args.json_file_or_folder
    else:
        print "Getting user's tweets and saving to: ", args.json_file_or_folder
        if not args.path_to_twitter_credentials_file:
            print "Can't do anything with a screen name without some API credentials, see the help for this script " \
                  "and this parameter!"
            sys.exit(-1)

        app_handler = TwitterApplicationHandler(
            pathToConfigFile=args.path_to_twitter_credentials_file)
        user = TwitterUser(screen_name=args.screen_name,
                           api_hook=app_handler.api_hooks[0])
        user.populate_tweets_from_api(
            json_output_filename=args.json_file_or_folder, sleep_var=False)

########
# load the models and the files
########

print 'LOADING MODEL'
identity_model, feature_names = get_identity_model_and_features()

word_vector_model, all_dictionaries, ark_clusters, sets, names = get_init_data(
    GENSIM_MODEL_LOCATION, BROWN_CLUSTER_LOCATION)
print 'MODEL HAS BEEN LOADED'

Example 17
def gen_conll_file(fil, ptb_dir, dp_dir):
    user = TwitterUser()
    user.populate_tweets_from_file(fil, do_tokenize=False)

    if 50 <= user.n_total_tweets <= 15000 and\
       user.followers_count <= 25000 and user.creation_date <= MIN_ACCOUNT_AGE:

        dp_filename = os.path.join(dp_dir, str(user.user_id) + ".gz")
        ptb_filename = os.path.join(ptb_dir, str(user.user_id) + ".txt.gz")

        if not os.path.exists(dp_filename) or not os.path.exists(ptb_filename):
            return [
                'no_dp_ptb',
                [
                    user.user_id,
                    os.path.exists(dp_filename),
                    os.path.exists(ptb_filename)
                ]
            ]

        penntreebank = {
            x[0]: x[1:]
            for x in read_grouped_by_newline_file(ptb_filename)
        }
        dependency_parse = read_grouped_by_newline_file(dp_filename)

        tweet_set = [(i,t) for i,t in enumerate(user.tweets) if t.retweeted is None and\
                       len(t.urls) == 0 and 'http:' not in t.text and\
                       langid.classify(t.text)[0] == 'en']

        # non english speaker or spam
        if len(tweet_set) < 40:
            return ['notweets', user.user_id]

        data_to_return = []
        for twit_it, tweet in tweet_set:

            data_for_tweet = []

            ptb_for_tweet = penntreebank[str(tweet.id)]
            dp_for_tweet = dependency_parse[twit_it]

            if ptb_for_tweet[0].split("\t")[2] != DependencyParseObject(
                    dp_for_tweet[0]).text:
                print 'ahhhhh, weird stuff'
                continue

            for i, p in enumerate(dp_for_tweet):
                d = DependencyParseObject(
                    tsn([
                        p, tweet.id, user.user_id,
                        tweet.created_at.strftime("%m-%d-%y")
                    ],
                        newline=False))
                # get java features
                spl_java = ptb_for_tweet[i].split("\t")
                java_id, penn_pos_tag, word = spl_java[:3]
                java_features = '' if len(spl_java) == 3 else spl_java[3]
                d.features += [x for x in java_features.split("|") if x != '']
                d.features.append("penn_treebank_pos=" + penn_pos_tag)
                data_for_tweet.append(d)
            data_to_return.append(data_for_tweet)

        return ['success', [user.user_id, data_to_return]]
    else:
        return ['baduser', user.user_id]
    else:
        print 'Running with json file: ', args.json_file_or_folder
elif args.screen_name:
    print 'Running with screen name: ', args.screen_name
    args.json_file_or_folder = os.path.join(OUTPUT_DIR,args.screen_name+".json.gz")
    if os.path.exists(args.json_file_or_folder):
        print "User's tweets already in the system at: ", args.json_file_or_folder
    else:
        print "Getting user's tweets and saving to: ", args.json_file_or_folder
        if not args.path_to_twitter_credentials_file:
            print "Can't do anything with a screen name without some API credentials, see the help for this script " \
                  "and this parameter!"
            sys.exit(-1)

        app_handler = TwitterApplicationHandler(pathToConfigFile=args.path_to_twitter_credentials_file)
        user = TwitterUser(screen_name=args.screen_name,
                           api_hook=app_handler.api_hooks[0])
        user.populate_tweets_from_api(json_output_filename=args.json_file_or_folder,sleep_var=False)

########
# load the models and the files
########

print 'LOADING MODEL'
identity_model,feature_names = get_identity_model_and_features()

word_vector_model, all_dictionaries, ark_clusters, sets, names = get_init_data(GENSIM_MODEL_LOCATION,
                                                                               BROWN_CLUSTER_LOCATION)
print 'MODEL HAS BEEN LOADED'

    def run(self):
        print('Worker started')
        # do some initialization here

        while True:
            data = self.queue.get(True)
            try:
                if data is None:
                    print('ALL FINISHED!!!!', self.conn_number)
                    break

                print('Starting: ', data)
                if self.gets_user_id:
                    user = TwitterUser(self.api_hook, user_id=data)
                else:
                    user = TwitterUser(self.api_hook, screen_name=data)

                user.populate_tweets_from_api(json_output_directory=os.path.join(self.out_dir,"json"))

                if len(user.tweets) == 0:
                    if self.to_pickle or self.populate_lists or self.populate_friends or self.populate_followers:
                        print 'pickling and dumping: ', user.screen_name
                        pickle.dump(user, open(os.path.join(self.out_dir,"obj",data), "wb"))
                    continue
                if self.populate_lists:
                    user.populate_lists_member_of()

                if self.populate_friends:
                    print 'populating friends, ', user.screen_name
                    user.populate_friends()

                if self.populate_followers:
                    print 'populating followers, ', user.screen_name
                    user.populate_followers()

                if self.to_pickle or self.populate_lists or self.populate_friends or self.populate_followers:
                    # Pickle and dump user
                    print 'pickling and dumping (no tweets): ', user.screen_name
                    user.tweets = []
                    pickle.dump(user, open(os.path.join(self.out_dir,"obj",data), "wb"))
            except Exception:
                print('FAILED:: ', data)
                exc_type, exc_value, exc_traceback = sys.exc_info()
                print("*** print_tb:")
                traceback.print_tb(exc_traceback, limit=30, file=sys.stdout)
                print("*** print_exception:")

            print('finished collecting data for: ', data)
    def run(self):
        print ("Worker started")

        while True:

            try:
                data = self.queue.get(True)
                if data is None:
                    print "ALL DONE, EXITING!"
                    return

                user_id, screen_name = data[0], data[1]
                print ("Starting: ", screen_name, user_id)

                this_user_network_dir_name = os.path.join(self.network_dir, user_id)
                mkdir_no_err(this_user_network_dir_name)

                stored_user_list = set(
                    [os.path.basename(user_pickle) for user_pickle in glob.glob(self.pickle_dir + "*")]
                )

                # Get the ego
                if user_id in stored_user_list:
                    print ("\tgot pickled: ", user_id)
                    user = pickle.load(open(self.pickle_dir + "/" + str(user_id), "rb"))
                else:
                    user = TwitterUser(self.api_hook, user_id=user_id)
                    print ("\tgetting tweets for: ", user_id)
                    user.populate_tweets_from_api()
                    print ("\t num tweets received for: ", user_id, " (", screen_name, "): ", len(user.tweets))
                    if len(user.tweets) > 0:
                        print ("\tgetting lists, friends, followers for: ", user_id)
                        user.populate_lists_member_of()
                        # user.populate_followers()
                        # user.populate_friends()

                    print ("pickling: ", screen_name)
                    pickle.dump(user, open(self.pickle_dir + "/" + user_id, "wb"))

                self.write_user_network(this_user_network_dir_name, user, user_id, None)

                if len(user.tweets) == 0:
                    print ("finished collecting data for: ", user_id, ", no tweets")
                    continue

                # Find the ego network based on retweets, mentions and replies
                user_network_to_pull = user.get_ego_network_actors()

                print ("Starting to get ", user.user_id, "'s network of ", len(user_network_to_pull), " actors")
                restrict_to_users = [u for u in user_network_to_pull]
                restrict_to_users.append(user_id)

                self.get_user_network(
                    this_user_network_dir_name, user_network_to_pull, restrict_to_users, stored_user_list
                )
            except Exception:
                print ("FAILED:: ", data)
                exc_type, exc_value, exc_traceback = sys.exc_info()
                print ("*** print_tb:")
                traceback.print_tb(exc_traceback, limit=50, file=sys.stdout)

            print ("finished collecting data for: ", screen_name)
Example 21
from twitter_dm.TwitterUser import TwitterUser
import datetime
from time import mktime
u = TwitterUser()
u.populate_tweets_from_file("/Users/kennyjoseph/git/thesis/twitter_dm/examples/2431225676.json.gz")

for t in u.tweets:
    print mktime(t.created_at.timetuple())
    def run(self):
        print('Worker started')
        # do some initialization here
        snow_sample_number = None
        since_tweet_id = None
        while True:
            data = self.queue.get(True)

            try:
                if data is None:
                    print 'ALL FINISHED!!!!'
                    break

                if len(data) == 1 or type(data) is str or type(
                        data) is unicode or type(data) is int:
                    user_identifier = data
                elif len(data) == 3:
                    user_identifier, snow_sample_number, since_tweet_id = data
                elif len(data) == 2:
                    if self.step_count:
                        user_identifier, snow_sample_number = data
                    elif self.gets_since_tweet_id:
                        user_identifier, since_tweet_id = data

                user_identifier = str(user_identifier)

                print 'Starting: ', data

                pickle_filename = os.path.join(self.out_dir, "obj",
                                               user_identifier)
                json_filename = os.path.join(self.out_dir, "json",
                                             user_identifier + ".json.gz")

                # Get the user's data
                if os.path.exists(pickle_filename) and os.path.exists(
                        json_filename) and not self.add_to_file:
                    print '\tgot existing data for: ', data
                    user = pickle.load(open(pickle_filename, "rb"))
                    user.populate_tweets_from_file(json_filename)
                else:
                    if self.gets_user_id:
                        user = TwitterUser(self.api_hook,
                                           user_id=user_identifier)
                    else:
                        user = TwitterUser(self.api_hook,
                                           screen_name=user_identifier)

                    print 'populating tweets', user_identifier

                    if self.populate_tweets:
                        if self.save_user_tweets:
                            print 'saving tweets to: ', json_filename
                            of_name, tweet_count = user.populate_tweets_from_api(
                                json_output_filename=json_filename,
                                since_id=since_tweet_id,
                                populate_object_with_tweets=False)
                        else:
                            of_name, tweet_count = user.populate_tweets_from_api(
                                since_id=since_tweet_id,
                                populate_object_with_tweets=False)

                        if self.tweet_count_file:
                            self.tweet_count_file.write(
                                str(user_identifier) + "\t" +
                                str(tweet_count) + "\n")

                    if self.populate_lists:
                        print 'populating lists', user.screen_name
                        user.populate_lists_member_of()

                    if self.populate_friends:
                        print 'populating friends, ', user.screen_name
                        user.populate_friends()

                    if self.populate_followers:
                        print 'populating followers, ', user.screen_name
                        user.populate_followers()

                    if self.save_user_data and \
                        (self.always_pickle or self.populate_lists
                         or self.populate_friends or self.populate_followers):
                        # Pickle and dump user
                        #print 'pickling and dumping (no tweets): ', user.screen_name
                        user.tweets = []
                        pickle.dump(user, open(pickle_filename, "wb"))

                # now add to queue if necessary
                if snow_sample_number is not None and snow_sample_number < self.step_count:
                    for user_identifier in self.add_users_to_queue_function(
                            user):
                        self.queue.put(
                            [str(user_identifier), snow_sample_number + 1])

                if self.post_process_function:
                    self.post_process_function(user)

            except KeyboardInterrupt as e:
                print e
                break
            except Exception:
                print('FAILED:: ', data)
                exc_type, exc_value, exc_traceback = sys.exc_info()
                print("*** print_tb:")
                traceback.print_tb(exc_traceback, limit=30, file=sys.stdout)
                print("*** print_exception:")