import glob
import os
import sys
from multiprocessing import Queue

from twitter_dm.multiprocess import multiprocess_setup
from twitter_dm.multiprocess.WorkerTweetData import TweetDataWorker
from twitter_dm.utility import general_utils

from twitter_dm.utility.general_utils import mkdir_no_err, collect_system_arguments, chunk_data

handles, output_dir, tweet_ids, is_ids = collect_system_arguments(sys.argv)


# Create the output directory
mkdir_no_err(output_dir)

# chunk the tweet IDs into groups of 100 (the lookup API accepts at most 100 IDs per request)
tweets_chunked = chunk_data(tweet_ids)
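# chunk_data itself is not shown here; presumably it just splits the list into
# 100-element sublists, i.e. roughly:
#   def chunk_data(data, n=100):
#       return [data[i:i + n] for i in range(0, len(data), n)]
assert all(len(chunk) <= 100 for chunk in tweets_chunked)  # sanity check under that assumption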


print tweets_chunked[0]
# init a sync manager
multiprocess_setup.init_good_sync_manager()
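# (init_good_sync_manager presumably installs a more robust multiprocessing sync manager;
#  every example here calls it once before loading the shared request queue)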

# put data on the queue
request_queue = multiprocess_setup.load_request_queue(tweets_chunked, len(handles))
# run!
processes = []
for i in range(len(handles)):
# assumed import path for the ego-network worker class used below
from twitter_dm.multiprocess.WorkerTwitterEgoNetwork import TwitterEgoNetworkWorker

OUTPUT_DIRECTORY = sys.argv[2]

# get all of the API handles (credentials) available to us
handles = general_utils.get_handles(glob.glob(os.path.join(sys.argv[1],"*.txt")))

print 'n authed users: ', len(handles)

# users we are interested in, one tab-separated screen-name/id pair per line
user_screenname_id_pairs = [line.strip().split("\t") for line in open(sys.argv[3]).readlines()]

print user_screenname_id_pairs[0]

pickle_dir = OUTPUT_DIRECTORY + "/obj/"
network_dir = OUTPUT_DIRECTORY + "/net/"

general_utils.mkdir_no_err(OUTPUT_DIRECTORY)
general_utils.mkdir_no_err(pickle_dir)
general_utils.mkdir_no_err(network_dir)

multiprocess_setup.init_good_sync_manager()

##put data on the queue
request_queue = multiprocess_setup.load_request_queue(user_screenname_id_pairs, len(handles))


processes = []
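# start one worker per authenticated API handle; each pulls (screen name, user id) jobs off the shared request queue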
for i in range(len(handles)):
    p = TwitterEgoNetworkWorker(request_queue, handles[i], network_dir, pickle_dir)
    p.start()
    processes.append(p)
if len(sys.argv) != 4:
    print 'usage:  [known_user_dir] [screen_name_file] [out_dir]'
    sys.exit(-1)

handles = []
for fil in glob.glob(sys.argv[1]+"/*.txt"):
    print 'FIL: ' , fil
    app_handler = TwitterApplicationHandler(pathToConfigFile=fil)
    handles += app_handler.api_hooks

print 'n authed users: ', len(handles)

user_ids = set([line.strip().lower() for line in open(sys.argv[2]).readlines()])
out_dir = sys.argv[3]
general_utils.mkdir_no_err(out_dir)

print "N TO FIND: ", len(user_ids)

user_ids = [u for u in user_ids]

user_data_chunked = []
i = 0
while i < len(user_ids):
    user_data_chunked.append(user_ids[i:(i + 100)])
    i += 100

print 'len chunked: ', len(user_data_chunked)
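# the manual chunking above is equivalent to
#   user_data_chunked = [user_ids[i:i + 100] for i in range(0, len(user_ids), 100)]
# (which is presumably what the chunk_data helper used in the other examples does)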
import os
import sys
from datetime import datetime

from twitter_dm.multiprocess import multiprocess_setup
from twitter_dm.multiprocess.WorkerUserData import UserDataWorker
from twitter_dm.utility.general_utils import mkdir_no_err, collect_system_arguments

(handles, out_dir, user_ids, is_ids, collect_friends, collect_followers,
 gen_tweet_counts_file) = collect_system_arguments(sys.argv, [
     'collect_friends (y/n)', 'collect_followers (y/n)',
     "gen_tweet_counts_file (y/n)"
 ])

handles = handles[:2]
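# (the slice above restricts this run to the first two credential sets; presumably a testing leftover)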

print 'num users: ', len(user_ids)

mkdir_no_err(out_dir)
mkdir_no_err(os.path.join(out_dir, "obj"))
mkdir_no_err(os.path.join(out_dir, "json"))

multiprocess_setup.init_good_sync_manager()

##put data on the queue
request_queue = multiprocess_setup.load_request_queue(user_ids, len(handles))

tweet_count_file_dir = None
if gen_tweet_counts_file == 'y':
    tweet_count_file_dir = "tweet_count" + str(datetime.now()).split(" ")[0]
    mkdir_no_err(os.path.join(out_dir, tweet_count_file_dir))

processes = []
for i in range(len(handles)):
]

pickle_dir = OUTPUT_DIRECTORY + "/obj/"
network_dir = OUTPUT_DIRECTORY + "/json/"

general_utils.mkdir_no_err(OUTPUT_DIRECTORY)
general_utils.mkdir_no_err(pickle_dir)
general_utils.mkdir_no_err(network_dir)

multiprocess_setup.init_good_sync_manager()

# look up the (screen name, user id) pairs for the requested accounts
user_screenname_id_pairs = get_user_ids_and_sn_data_from_list(
    user_sns, handles, True)
print 'got screen names, ', len(user_screenname_id_pairs)

# put data on the queue
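# add_nones=False presumably skips the per-worker None sentinels that signal a worker
# to exit (the worker run() loop further down returns when it dequeues None)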
request_queue = multiprocess_setup.load_request_queue(
    [(x[1], 0) for x in user_screenname_id_pairs],
    len(handles),
    add_nones=False)
if __name__ == '__main__':
    freeze_support()

    if len(sys.argv) != 4:
        print "usage:  [known_user_dir] [output_dir] [tweet_id_file]"
        sys.exit(-1)

    handles = general_utils.get_handles(glob.glob(os.path.join(sys.argv[1], "*.txt")))
    print "n authed users: ", len(handles)

    out_dir = sys.argv[2]

    user_ids = [line.strip().split(",")[0] for line in open(sys.argv[3]).readlines()]

    print "num users: ", len(user_ids)

    general_utils.mkdir_no_err(out_dir)
    general_utils.mkdir_no_err(os.path.join(out_dir, "obj"))
    general_utils.mkdir_no_err(os.path.join(out_dir, "json"))
    multiprocess_setup.init_good_sync_manager()

    # already_done = set([os.path.basename(f) for f in glob.glob(out_dir+"/*")])
    print "len already done:", 0
    # user_screennames = [u for u in user_screennames if u not in already_done]

    ##put data on the queue
    request_queue = multiprocess_setup.load_request_queue(user_ids, len(handles))

    processes = []
    for i in range(len(handles)):
        p = UserDataWorker(
            request_queue,
    def run(self):
        print ("Worker started")

        while True:

            try:
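                # block until a job arrives; a None entry on the queue is the shutdown signal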
                data = self.queue.get(True)
                if data is None:
                    print "ALL DONE, EXITING!"
                    return

                user_id, screen_name = data[0], data[1]
                print ("Starting: ", screen_name, user_id)

                this_user_network_dir_name = os.path.join(self.network_dir, user_id)
                mkdir_no_err(this_user_network_dir_name)

                stored_user_list = set(
                    [os.path.basename(user_pickle) for user_pickle in glob.glob(self.pickle_dir + "*")]
                )

                # Get the ego
                if user_id in stored_user_list:
                    print ("\tgot pickled: ", user_id)
                    user = pickle.load(open(self.pickle_dir + "/" + str(user_id), "rb"))
                else:
                    user = TwitterUser(self.api_hook, user_id=user_id)
                    print ("\tgetting tweets for: ", user_id)
                    user.populate_tweets_from_api()
                    print ("\t num tweets received for: ", user_id, " (", screen_name, "): ", len(user.tweets))
                    if len(user.tweets) > 0:
                        print ("\tgetting lists, friends, followers for: ", user_id)
                        user.populate_lists_member_of()
                        # user.populate_followers()
                        # user.populate_friends()

                    print ("pickling: ", screen_name)
                    pickle.dump(user, open(self.pickle_dir + "/" + user_id, "wb"))

                self.write_user_network(this_user_network_dir_name, user, user_id, None)

                if len(user.tweets) == 0:
                    print ("finished collecting data for: ", user_id, ", no tweets")
                    continue

                # Find the ego network based on retweets, mentions and replies
                user_network_to_pull = user.get_ego_network_actors()

                print ("Starting to get ", user.user_id, "'s network of ", len(user_network_to_pull), " actors")
                restrict_to_users = [u for u in user_network_to_pull]
                restrict_to_users.append(user_id)

                self.get_user_network(
                    this_user_network_dir_name, user_network_to_pull, restrict_to_users, stored_user_list
                )
            except Exception:
                print ("FAILED: ", data)
                exc_type, exc_value, exc_traceback = sys.exc_info()
                print ("*** print_tb:")
                traceback.print_tb(exc_traceback, limit=50, file=sys.stdout)
                continue

            print ("finished collecting data for: ", screen_name)
    if len(sys.argv) != 4:
        print 'usage:  [known_user_dir] [output_dir] [tweet_id_file]'
        sys.exit(-1)

    handles = general_utils.get_handles(
        glob.glob(os.path.join(sys.argv[1], "*.txt")))
    print 'n authed users: ', len(handles)

    out_dir = sys.argv[2]

    user_ids = [
        line.strip().split(",")[0] for line in open(sys.argv[3]).readlines()
    ]

    print 'num users: ', len(user_ids)

    general_utils.mkdir_no_err(out_dir)
    general_utils.mkdir_no_err(os.path.join(out_dir, "obj"))
    general_utils.mkdir_no_err(os.path.join(out_dir, "json"))
    multiprocess_setup.init_good_sync_manager()

    #already_done = set([os.path.basename(f) for f in glob.glob(out_dir+"/*")])
    print 'len already done:', 0
    #user_screennames = [u for u in user_screennames if u not in already_done]

    ##put data on the queue
    request_queue = multiprocess_setup.load_request_queue(
        user_ids, len(handles))

    processes = []
    for i in range(len(handles)):
        p = UserDataWorker(request_queue,
# Example 9
import cPickle as pickle
import sys
from collections import Counter
from multiprocessing import Pool
from os import listdir, mkdir
import os
from twitter_dm.utility.general_utils import tab_stringify_newline as tsn, mkdir_no_err

if len(sys.argv) != 4:
    print 'usage:  [input_dir] [output dir] [# cores for execution]'
    sys.exit(-1)

INPUT_DIR = sys.argv[1]
OUTPUT_DIR = sys.argv[2]
mkdir_no_err(OUTPUT_DIR)
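# this example post-processes previously collected users: get_user_info below reloads each
# pickled TwitterUser from INPUT_DIR/obj and re-reads its tweets from the matching
# gzipped JSON file under INPUT_DIR/json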


def get_user_info(d):

    #i, uid = d
    #if i % 1000 == 0:
    #    print i
    #try:
    i, uid = d
    u = pickle.load(open(os.path.join(INPUT_DIR, 'obj', uid), 'rb'))
    fname = os.path.join(INPUT_DIR, 'json', uid + '.json.gz')

    u.populate_tweets_from_file(fname,
                                store_json=False,
                                do_arabic_stemming=False,
# Example 10
Basically, we don't get information back from the API if these users have been suspended/deleted, so
we can learn from that information
"""
__author__ = 'kjoseph'

import glob
import sys

from twitter_dm.utility.general_utils import collect_system_arguments, chunk_data
from twitter_dm.multiprocess import multiprocess_setup
from twitter_dm.multiprocess.WorkerSimpleUserLookup import SimpleUserLookupWorker
from twitter_dm.utility import general_utils

handles, out_dir, data_to_collect, is_ids = collect_system_arguments(sys.argv)

general_utils.mkdir_no_err(out_dir)

user_data_chunked = chunk_data(data_to_collect)
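# (the users/lookup endpoint accepts at most 100 ids or screen names per request, hence the chunking)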
print 'len chunked: ', len(user_data_chunked)

# initialize a better sync manager
multiprocess_setup.init_good_sync_manager()

# put data on the queue
request_queue = multiprocess_setup.load_request_queue(
    [x for x in user_data_chunked], len(handles), add_nones=True)

processes = []
for i in range(len(handles)):
    p = SimpleUserLookupWorker(request_queue,
                               handles[i],
                    tweet_has_identity = True
                tweet.append(x.get_conll_form() + "\t" + lab)
                i += 1
            if tweet_has_identity:
                outfil.write(("\n".join(tweet) +"\n\n").encode("utf8"))

        outfil.close()
        return ['success', final_out_filename]
    except Exception:
        print 'UNKNOWN ERROR: ', json_file_name
        return ['no_dp_ptb', False, False]




mkdir_no_err(OUTPUT_DIR)

users_to_ignore = open("results/u_ignore.txt","w")
users_no_tweets = open("results/u_notweets.txt","w")
users_need_dp = open("results/u_needdp.txt","w")
users_need_ptb = open("results/u_need_ptb.txt","w")
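# (these output files assume a local results/ directory already exists; the mkdir_no_err
#  call above only creates OUTPUT_DIR)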


word_vector_model, all_dictionaries, ark_clusters, sets, names = get_init_data(
    'gensim_model/glove_twitter_50_raw_model.txt.gz',
    "processed_data/50mpaths2",
    "dictionaries/*/*",
    BOOTSTRAPPED_DICTIONARY_LOCATION)
CONLL_FILE = "processed_data/all_conll_pub_and_nonpub.txt"
features_from_conll_file, dict_for_filter = get_all_features(CONLL_FILE,
                                                             all_dictionaries,
from glob import glob
from multiprocessing import Pool
import os

from twitter_dm import TwitterUser
from twitter_dm import dependency_parse_tweets
from twitter_dm.utility.general_utils import mkdir_no_err

CPU_COUNT = 2
TWEEBOPARSER_LOC = 'PATH_TO_TWEEBO_PARSER'
DATA_DIR = "PATH_TO_DIRECTORY_OF_(GZIPPED)_JSON_FILES_WITH_TWEETS"



def do_dependency_parse(fil):
    u = TwitterUser()
    u.populate_tweets_from_file(fil,do_tokenize=False)
    out_file_name = fil.replace(".json", "").replace(".gz", "").replace("/json/", "/dep_parse/")
    print out_file_name

    if len(u.tweets) == 0:
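        # note: os.utime is presumably meant to "touch" a marker file here, but it fails if
        # out_file_name does not already exist (a hypothetical fix: open(out_file_name, 'a').close())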
        os.utime(out_file_name)
        return 'empty, success'

    data = dependency_parse_tweets(TWEEBOPARSER_LOC, u.tweets, out_file_name)
    return 'completed'


mkdir_no_err(DATA_DIR.replace("json", "dep_parse"))
pool = Pool(processes=CPU_COUNT)

#do_dependency_parse(glob(DATA_DIR+"/*")[0])
result = pool.map(do_dependency_parse, glob(DATA_DIR+"/*"))
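# Pool.map blocks until every input file has been parsed; the pool runs CPU_COUNT
# worker processes in parallel, one file per task
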
# Example 13
import os
from glob import glob
from multiprocessing import Pool

from twitter_dm import TwitterUser
from twitter_dm import dependency_parse_tweets
from twitter_dm.utility.general_utils import mkdir_no_err

CPU_COUNT = 2
TWEEBOPARSER_LOC = 'PATH_TO_TWEEBO_PARSER'
DATA_DIR = "PATH_TO_DIRECTORY_OF_(GZIPPED)_JSON_FILES_WITH_TWEETS"


def do_dependency_parse(fil):
    u = TwitterUser()
    u.populate_tweets_from_file(fil, do_tokenize=False)
    out_file_name = fil.replace(".json", "").replace(".gz", "").replace(
        "/json/", "/dep_parse/")
    print out_file_name

    if len(u.tweets) == 0:
        os.utime(out_file_name)
        return 'empty, success'

    data = dependency_parse_tweets(TWEEBOPARSER_LOC, u.tweets, out_file_name)
    return 'completed'


mkdir_no_err(DATA_DIR.replace("json", "dep_parse"))
pool = Pool(processes=CPU_COUNT)

#do_dependency_parse(glob(DATA_DIR+"/*")[0])
result = pool.map(do_dependency_parse, glob(DATA_DIR + "/*"))