def gen_json_for_tweets_of_interest(data, identity_list):
    of_id, uid_list = data
    json_of_name = os.path.join(JSON_OUTPUT_DIRECTORY, str(of_id) + ".json.gz")
    print 'inp: ', json_of_name, len(uid_list), uid_list[0:2]

    tweets_to_write = []
    if not os.path.exists(json_of_name):
        for i, uid in enumerate(uid_list):
            if i % 25 == 0:
                print i, len(tweets_to_write)
            try:
                u = TwitterUser()
                u.populate_tweets_from_file(
                    os.path.join(JSON_INPUT_DIRECTORY, uid + ".json.gz"),
                    store_json=True)

                # keep non-retweets with more than 4 tokens whose expanded
                # token set overlaps the identity term list
                tweets_to_keep = []
                for t in u.tweets:
                    if not t.retweeted and len(t.tokens) > 4:
                        expanded_token_set = copy(t.tokens)
                        for token in t.tokens:
                            expanded_token_set += get_alternate_wordforms(token)
                        if len(set(expanded_token_set) & identity_list):
                            tweets_to_keep.append(t)
                tweets_to_write += tweets_to_keep
            except:
                print 'FAILED JSON FOR USER: ', uid

        print 'WRITING JSON'
        out_fil = gzip.open(json_of_name, "wb")
        for tweet in tweets_to_write:
            out_fil.write(
                json.dumps(tweet.raw_json).strip().encode("utf8") + "\n")
        out_fil.close()
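# A minimal, hypothetical driver for the function above -- not part of the
# original module. It assumes the JSON_INPUT_DIRECTORY / JSON_OUTPUT_DIRECTORY
# globals are defined by the caller; the chunk id, user ids and identity terms
# below are placeholders.
JSON_INPUT_DIRECTORY = "json_in"
JSON_OUTPUT_DIRECTORY = "json_out"

if __name__ == '__main__':
    identity_terms = set(['teacher', 'nurse', 'mother'])   # placeholder term set
    chunk = (0, ['2431225676', '123456789'])                # (output file id, user ids)
    gen_json_for_tweets_of_interest(chunk, identity_terms)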
def gen_conll_file(fil, ptb_dir, dp_dir):
    user = TwitterUser()
    user.populate_tweets_from_file(fil, do_tokenize=False)

    # restrict to accounts that look like real, established users
    if 50 <= user.n_total_tweets <= 15000 and \
            user.followers_count <= 25000 and user.creation_date <= MIN_ACCOUNT_AGE:

        dp_filename = os.path.join(dp_dir, str(user.user_id) + ".gz")
        ptb_filename = os.path.join(ptb_dir, str(user.user_id) + ".txt.gz")

        if not os.path.exists(dp_filename) or not os.path.exists(ptb_filename):
            return ['no_dp_ptb',
                    [user.user_id, os.path.exists(dp_filename), os.path.exists(ptb_filename)]]

        penntreebank = {x[0]: x[1:] for x in read_grouped_by_newline_file(ptb_filename)}
        dependency_parse = read_grouped_by_newline_file(dp_filename)

        # keep original, URL-free, English tweets (drops non-English speakers and spam)
        tweet_set = [(i, t) for i, t in enumerate(user.tweets)
                     if t.retweeted is None and
                     len(t.urls) == 0 and 'http:' not in t.text and
                     langid.classify(t.text)[0] == 'en']

        if len(tweet_set) < 40:
            return ['notweets', user.user_id]

        data_to_return = []
        for twit_it, tweet in tweet_set:
            data_for_tweet = []
            ptb_for_tweet = penntreebank[str(tweet.id)]
            dp_for_tweet = dependency_parse[twit_it]

            # sanity check: the PTB and dependency-parse files must line up token for token
            if ptb_for_tweet[0].split("\t")[2] != DependencyParseObject(dp_for_tweet[0]).text:
                print 'PTB/dependency parse mismatch, skipping tweet'
                continue

            for i, p in enumerate(dp_for_tweet):
                d = DependencyParseObject(
                    tsn([p, tweet.id, user.user_id,
                         tweet.created_at.strftime("%m-%d-%y")], newline=False))
                # get java features
                spl_java = ptb_for_tweet[i].split("\t")
                java_id, penn_pos_tag, word = spl_java[:3]
                java_features = '' if len(spl_java) == 3 else spl_java[3]
                d.features += [x for x in java_features.split("|") if x != '']
                d.features.append("penn_treebank_pos=" + penn_pos_tag)
                data_for_tweet.append(d)

            data_to_return.append(data_for_tweet)

        return ['success', [user.user_id, data_to_return]]
    else:
        return ['baduser', user.user_id]
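# A hedged caller sketch for gen_conll_file(), illustrating the (status, payload)
# convention of its return value; the file and directory names are placeholders,
# not values from the original code.
status, payload = gen_conll_file("json/2431225676.json.gz", "ptb_dir", "dp_dir")
if status == 'success':
    user_id, parsed_tweets = payload
    print 'parsed', len(parsed_tweets), 'tweets for user', user_id
else:
    print 'skipped user:', status, payload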
def gen_output(data, json_data_dir):
    term, is_reply, tweets_needed = data
    dataset = []

    # get all user files
    files = glob.glob(os.path.join(json_data_dir, "*"))
    random.shuffle(files)

    for f in files:
        user = TwitterUser()
        user.populate_tweets_from_file(f, store_json=True,
                                       do_arabic_stemming=False, lemmatize=False)

        if 50 <= user.n_total_tweets <= 10000 and \
                user.followers_count <= 25000 and user.creation_date <= MIN_ACCOUNT_AGE:

            tweet_set = [t for t in user.tweets
                         if t.retweeted is None and
                         len(t.urls) == 0 and 'http:' not in t.text and
                         len(t.tokens) > 5 and
                         t.created_at >= MIN_TWEET_DATE and
                         (term == '' or term in t.tokens) and
                         langid.classify(t.text)[0] == 'en' and
                         sentiment(t.text)['compound'] != 0]

            if is_reply:
                tweet_set = [t for t in tweet_set if t.reply_to]
            else:
                tweet_set = [t for t in tweet_set if not t.reply_to]

            if len(tweet_set) == 0:
                print 'size 0', term, tweets_needed, is_reply
                continue

            tweet = random.sample(tweet_set, 1)[0]
            print user.screen_name, term, tweets_needed, is_reply, ":::: ", tweet.text
            dataset.append(tweet)

            tweets_needed -= 1
            if tweets_needed == 0:
                name = term if term != '' else 'random'
                name += '_reply' if is_reply else '_non_reply'
                pickle.dump(dataset, open(name + ".p", 'wb'))
                print 'done with: ', name, is_reply
                return
        else:
            print 'failed user'
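# A hedged example of how gen_output() might be invoked; the search term, sample
# size and data directory are placeholders, not values from the original code.
gen_output(('teacher', False, 100), 'user_json_dir')   # 100 non-reply tweets mentioning "teacher"
gen_output(('', True, 100), 'user_json_dir')           # 100 random reply tweets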
def gen_json_for_tweets_of_interest(input_filename, output_filename,
                                    keep_only_tweets_with_terms=None):
    """
    This function generates a cleaned json file so that the identity extraction
    only happens on "interesting" tweets. Right now, interesting is defined as
    non-retweets that have more than 4 tokens. Feel free to redefine as you see fit.

    :param input_filename: input json file name (can be gzipped)
    :param output_filename: cleaned output json filename
    :param keep_only_tweets_with_terms: If you only want to keep tweets containing
        a specific set of terms, pass in a set of terms here
    :return:
    """
    tweets_to_write = []
    if not os.path.exists(output_filename):
        try:
            u = TwitterUser()
            u.populate_tweets_from_file(input_filename, store_json=True)

            # "interesting" tweets: non-retweets with more than 4 tokens
            tweets = [t for t in u.tweets if not t.retweeted and len(t.tokens) > 4]

            tweets_to_keep = []
            if keep_only_tweets_with_terms:
                for t in tweets:
                    # expand each tweet's tokens with alternate word forms before matching
                    expanded_token_set = copy(t.tokens)
                    for token in t.tokens:
                        expanded_token_set += get_alternate_wordforms(token)
                    if len(set(expanded_token_set) & keep_only_tweets_with_terms):
                        tweets_to_keep.append(t)
            else:
                tweets_to_keep = tweets

            tweets_to_write += tweets_to_keep
        except:
            print 'FAILED TO PARSE JSON FILE: ', input_filename

        print 'WRITING JSON'
        out_fil = gzip.open(output_filename, "wb")
        for tweet in tweets_to_write:
            out_fil.write(json.dumps(tweet.raw_json).strip().encode("utf8") + "\n")
        out_fil.close()
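# A minimal usage sketch for the cleaned-JSON generator above. Only the function
# signature comes from the code; the file paths and identity terms are placeholders.
if __name__ == '__main__':
    terms = set(['teacher', 'nurse', 'mother'])
    gen_json_for_tweets_of_interest('raw/2431225676.json.gz',
                                    'clean/2431225676.json.gz',
                                    keep_only_tweets_with_terms=terms)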
def getTweets(twitterid):
    '''
    Function to get the twitter data for an individual twitter ID.
    This function is written to work with Kenny's github example here:
    https://github.com/kennyjoseph/twitter_dm
    Input: string of twitterID
    Output: list of the token lists of all tweets for twitterID
    '''
    from twitter_dm.TwitterUser import TwitterUser

    tweets = []
    u = TwitterUser()
    # Need to figure out if we can use a numeric ID (123456789.json) or a
    # screen name (kenny_joseph.json)
    u.populate_tweets_from_file(twitterid + '.json')
    for t in u.tweets:
        # Not sure if tokens is exactly what we want -- we want the raw words,
        # not necessarily tokens. We'll check this.
        tweets.append(t.tokens)

    # texts = {}
    # source_filename = 'Datasets/Twitter/members.zip'
    # parser = etree.XMLParser(encoding='utf8', recover=True)
    # with zipfile.ZipFile(source_filename) as zf:
    #     for i, member in enumerate(zf.infolist()):
    #         name = member.filename.split('/')[1].split('.')[0]  # filename is Raw3/name.csv
    #         if idx == name:
    #             # print idx, name
    #             raw = zf.open(member)
    #             data = csv.reader(raw)
    #             for j, line in enumerate(data):
    #                 if j > 0:
    #                     texts[idx + '_' + str(j)] = line[0]
    # if texts == {}:
    #     print 'no tweets for ', idx

    return tweets
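# Hypothetical call to getTweets(); the ID below is a placeholder and the
# matching <id>.json file is assumed to sit in the working directory.
if __name__ == '__main__':
    token_lists = getTweets('2431225676')
    print 'loaded', len(token_lists), 'tweets'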
from twitter_dm.TwitterUser import TwitterUser
import datetime
from time import mktime

u = TwitterUser()
u.populate_tweets_from_file(
    "/Users/kennyjoseph/git/thesis/twitter_dm/examples/2431225676.json.gz")

for t in u.tweets:
    print mktime(t.created_at.timetuple())
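# A small, assumed extension of the snippet above (same input file and the same
# TwitterUser object): print each tweet's epoch timestamp next to its text.
for t in u.tweets:
    print mktime(t.created_at.timetuple()), t.text.encode('utf8')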