def getRelTweets(newsID, dtpure, tweetPre, tweetIDset, tweetSet):
    """Collect deduplicated tweets for one news item.

    Reads the single tab-separated tweet file matching
    ``tweetPre + dtpure + "/" + str(newsID) + "_*"`` and returns
    ``(tweets, tweetsObj)``: a list of cleaned tweet texts and a parallel
    list of ``bk.Tweet`` objects.

    ``tweetIDset`` and ``tweetSet`` are mutated in place — tweet IDs and
    texts already seen are recorded so duplicates are skipped across calls.

    NOTE(review): this file defines ``getRelTweets`` several times; the
    last definition wins at import time — confirm which one is intended.
    """
    t_path = glob.glob(tweetPre + dtpure + "/" + str(newsID) + "_*")
    if len(t_path) != 1:
        # Either no file or an ambiguous match: nothing usable for this news item.
        print('no tweets for news ', newsID, 'len(t_path)', len(t_path))
        return ([], [])
    # Initialize before the existence check so the final return never hits a
    # NameError when the matched path has vanished (original bug).
    tweets = []
    tweetsObj = []  # stupid redundancy (kept parallel to `tweets`)
    if os.path.exists(t_path[0]):
        # `with` guarantees the handle is closed even if a line raises
        # (original used open()/close() and leaked the handle on exceptions).
        with codecs.open(t_path[0], encoding='utf-8') as t:
            for line in t:
                fields = line.strip().split("\t")
                if len(fields) < 24:
                    # malformed / truncated row — skip
                    continue
                (ID, raw_text, created_at, contained_url, hash_tags,
                 retw_id_str, retw_favorited, retw_favorite_count,
                 is_retweet, retweet_count, tw_favorited, tw_favorite_count,
                 tw_retweeted, tw_retweet_count, user_id_str, verified,
                 follower_count, statuses_count, friends_count,
                 favorites_count, user_created_at) = fields[:21]
                try:
                    ID = int(ID)
                except ValueError:
                    continue
                # BUG FIX: bool("False") is True — the original treated every
                # non-empty string as a retweet. Parse the textual flag instead.
                is_retweet = is_retweet.strip().lower() in ('true', '1')
                try:
                    retweet_count = int(retweet_count)
                except ValueError:
                    retweet_count = -1  # sentinel: count unavailable
                # Truncate a trailing URL down to a bare "http" marker.
                words = raw_text.split()
                if words and words[-1].startswith('http'):
                    # guard on `words` avoids IndexError on blank/whitespace-only
                    # text (original bug)
                    raw_text = raw_text.split('http')[0] + 'http'
                raw_text = unidecode.unidecode(raw_text)
                # Keep only original (non-RT) tweets not seen before, by ID or text.
                if (not raw_text.startswith('RT @')
                        and ID not in tweetIDset and raw_text not in tweetSet):
                    # assumes bk.Tweet(ID, text, created_at, is_retweet,
                    # retweet_count, hash_tags) — TODO confirm signature
                    tweet = bk.Tweet(ID, raw_text, created_at, is_retweet,
                                     retweet_count, hash_tags)
                    tweetsObj.append(tweet)
                    tweets.append(raw_text)
                    tweetIDset.add(ID)
                    tweetSet.add(raw_text)
    return (tweets, tweetsObj)
def getRelTweets(newsID, dtpure, tweetPre, tweetsObj):
    """Load tweets for one news item into ``tweetsObj`` (dict keyed by tweet ID).

    Reads the single tab-separated tweet file matching
    ``tweetPre + dtpure + "/" + str(newsID) + "_*"`` and inserts a
    ``bk.Tweet(..., pop=1)`` per unseen tweet with more than 5 words.
    Mutates ``tweetsObj`` in place; returns ``None``.

    NOTE(review): this file defines ``getRelTweets`` several times; the
    last definition wins at import time — confirm which one is intended.
    """
    t_path = glob.glob(tweetPre + dtpure + "/" + str(newsID) + "_*")
    if len(t_path) != 1:
        # Either no file or an ambiguous match: nothing to load.
        print('no tweets for news ', newsID, 'len(t_path)', len(t_path))
        return
    if os.path.exists(t_path[0]):
        # `with` closes the handle even on exceptions (original leaked it).
        with codecs.open(t_path[0], encoding='utf-8') as t:
            for line in t:
                fields = line.strip().split("\t")
                if len(fields) < 24:
                    # malformed / truncated row — skip
                    continue
                (ID, raw_text, created_at, contained_url, hash_tags,
                 retw_id_str, retw_favorited, retw_favorite_count,
                 is_retweet, retweet_count, tw_favorited, tw_favorite_count,
                 tw_retweeted, tw_retweet_count, user_id_str, verified,
                 follower_count, statuses_count, friends_count,
                 favorites_count, user_created_at) = fields[:21]
                try:
                    ID = int(ID)
                except ValueError:
                    continue
                # Skip duplicates and very short tweets (<= 5 words).
                if ID in tweetsObj or len(raw_text.split()) <= 5:
                    continue
                # BUG FIX: bool("False") is True — the original treated every
                # non-empty string as a retweet. Parse the textual flag instead.
                is_retweet = is_retweet.strip().lower() in ('true', '1')
                try:
                    retweet_count = int(retweet_count)
                except ValueError:
                    retweet_count = -1  # sentinel: count unavailable
                # Truncate a trailing URL down to a bare "http" marker.
                words = raw_text.split()
                if words and words[-1].startswith('http'):
                    # guard avoids IndexError on blank text (defensive; the
                    # >5-word filter above already implies non-empty)
                    raw_text = raw_text.split('http')[0] + 'http'
                raw_text = unidecode.unidecode(raw_text)
                # assumes bk.Tweet(ID, text, created_at, is_retweet,
                # retweet_count, hash_tags, pop=1) — TODO confirm signature
                tweetsObj[ID] = bk.Tweet(ID, raw_text, created_at, is_retweet,
                                         retweet_count, hash_tags, pop=1)
def getRelTweets(newsID, dtpure, tweetPre, tweetIDset, tweetSet):
    """Collect deduplicated, URL-free, non-retweet tweets for one news item.

    Reads the single tab-separated tweet file matching
    ``tweetPre + dtpure + "/" + str(newsID) + "_*"`` and returns
    ``(tweets, tweetsObj)``: a list of tweet texts and a parallel list of
    ``bk.Tweet`` objects. Unlike the sibling variants, this one drops any
    tweet whose text contains "http" or "RT @" instead of cleaning it.

    ``tweetIDset`` and ``tweetSet`` are mutated in place so duplicates are
    skipped across calls.

    NOTE(review): this file defines ``getRelTweets`` several times; the
    last definition wins at import time — confirm which one is intended.
    (Large spans of commented-out dead code — Django ORM lookups, URL
    stripping, timezone parsing — were removed here.)
    """
    t_path = glob.glob(tweetPre + dtpure + "/" + str(newsID) + "_*")
    if len(t_path) != 1:
        # Either no file or an ambiguous match: nothing usable.
        print('no tweets for news ', newsID, 'len(t_path)', len(t_path))
        return ([], [])
    # Initialize before the existence check so the final return never hits a
    # NameError when the matched path has vanished (original bug).
    tweets = []
    tweetsObj = []  # stupid redundancy (kept parallel to `tweets`)
    if os.path.exists(t_path[0]):
        # `with` closes the handle even on exceptions (original leaked it).
        with codecs.open(t_path[0], encoding='utf-8') as t:
            for line in t:
                fields = line.strip().split("\t")
                if len(fields) < 24:
                    # malformed / truncated row — skip
                    continue
                (ID, raw_text, created_at, contained_url, hash_tags,
                 retw_id_str, retw_favorited, retw_favorite_count,
                 is_retweet, retweet_count, tw_favorited, tw_favorite_count,
                 tw_retweeted, tw_retweet_count, user_id_str, verified,
                 follower_count, statuses_count, friends_count,
                 favorites_count, user_created_at) = fields[:21]
                try:
                    ID = int(ID)
                except ValueError:
                    continue
                # BUG FIX: bool("False") is True — the original treated every
                # non-empty string as a retweet. Parse the textual flag instead.
                is_retweet = is_retweet.strip().lower() in ('true', '1')
                try:
                    retweet_count = int(retweet_count)
                except ValueError:
                    retweet_count = -1  # sentinel: count unavailable
                # Keep only URL-free, non-RT tweets not seen before by ID or text.
                if ("http" not in raw_text and "RT @" not in raw_text
                        and ID not in tweetIDset and raw_text not in tweetSet):
                    # assumes bk.Tweet(ID, text, created_at, is_retweet,
                    # retweet_count, hash_tags) — TODO confirm signature
                    tweet = bk.Tweet(ID, raw_text, created_at, is_retweet,
                                     retweet_count, hash_tags)
                    tweetsObj.append(tweet)
                    tweets.append(raw_text)
                    tweetIDset.add(ID)
                    tweetSet.add(raw_text)
    return (tweets, tweetsObj)