def getRelTweets(newsID, dtpure, tweetPre, tweetIDset, tweetSet):
    """Collect the new, non-retweet tweets stored for one news item.

    The tweet file is located with the glob pattern
    ``tweetPre + dtpure + "/" + str(newsID) + "_*"`` and read as UTF-8,
    one tab-separated tweet per line.  A tweet is kept when its text does
    not start with ``'RT @'`` and neither its ID nor its (normalised) text
    has been seen before.

    Args:
        newsID: Identifier of the news item; used as the file-name prefix.
        dtpure: Sub-directory name appended to ``tweetPre``.
        tweetPre: Path prefix of the tweet directories.  It is concatenated
            as-is, so it must already end with a path separator.
        tweetIDset: Set of tweet IDs already collected; updated in place.
        tweetSet: Set of tweet texts already collected; updated in place.

    Returns:
        A tuple ``(tweets, tweetsObj)`` where ``tweets`` is the list of kept
        tweet texts and ``tweetsObj`` the matching list of ``bk.Tweet``
        objects.  Both lists are empty when the glob does not match exactly
        one file or when the match cannot be opened.
    """
    t_path = glob.glob(tweetPre + dtpure + "/" + str(newsID) + "_*")
    if len(t_path) != 1:
        print('no tweets for news ', newsID, 'len(t_path)', len(t_path))
        return ([], [])
    # glob can report an entry that does not resolve (e.g. a dangling
    # symlink); previously this left the file handle unbound.
    if not os.path.exists(t_path[0]):
        return ([], [])

    # Spellings of the is_retweet column that mean "not a retweet".
    false_tokens = ('', '0', 'false', 'no', 'none')
    tweets = []
    tweetsObj = []
    # The context manager closes the file even if a row raises.
    with codecs.open(t_path[0], encoding='utf-8') as t:
        for line in t:
            fields = line.strip().split("\t")
            # Rows with fewer than 24 columns are treated as malformed.
            if len(fields) < 24:
                continue
            # Only the columns used below are read:
            # 0 = ID, 1 = raw_text, 2 = created_at, 4 = hash_tags,
            # 8 = is_retweet, 9 = retweet_count.
            ID = fields[0]
            raw_text = fields[1]
            created_at = fields[2]
            hash_tags = fields[4]
            is_retweet = fields[8]
            retweet_count = fields[9]
            try:
                ID = int(ID)
            except ValueError:
                continue
            words = raw_text.split()
            # An empty text column used to crash on words[-1]; such a row
            # carries no tweet, so skip it.
            if not words:
                continue
            # bool() of any non-empty string is True, so "False"/"0" used to
            # be read as retweets; interpret the text instead.
            is_retweet = is_retweet.strip().lower() not in false_tokens
            try:
                retweet_count = int(retweet_count)
            except ValueError:
                retweet_count = -1  # sentinel for an unparsable count
            # convert url to http
            # NOTE(review): this cuts at the FIRST 'http' in the text, so
            # anything between an earlier URL and the trailing one is
            # dropped -- confirm that is intended.
            if words[-1].startswith('http'):
                raw_text = raw_text.split('http')[0] + 'http'
            # ASCII transliteration via unidecode.
            raw_text = unidecode.unidecode(raw_text)
            if raw_text.startswith('RT @'):
                continue
            if ID in tweetIDset or raw_text in tweetSet:
                continue
            tweet = bk.Tweet(ID, raw_text, created_at, is_retweet,
                             retweet_count, hash_tags)
            tweetsObj.append(tweet)
            tweets.append(raw_text)
            tweetIDset.add(ID)
            tweetSet.add(raw_text)
    return (tweets, tweetsObj)
Example #2
0
def getRelTweets(newsID,dtpure,tweetPre,tweetsObj):
    """Add the tweets stored for one news item to ``tweetsObj``.

    The tweet file is located with the glob pattern
    ``tweetPre + dtpure + "/" + str(newsID) + "_*"`` and read as UTF-8,
    one tab-separated tweet per line.  Tweets whose ID is already a key of
    ``tweetsObj`` and tweets of five words or fewer are skipped.

    Args:
        newsID: Identifier of the news item; used as the file-name prefix.
        dtpure: Sub-directory name appended to ``tweetPre``.
        tweetPre: Path prefix of the tweet directories.  It is concatenated
            as-is, so it must already end with a path separator.
        tweetsObj: Mapping of tweet ID to ``bk.Tweet``; updated in place.

    Returns:
        None.  Nothing is added when the glob does not match exactly one
        file or when the match cannot be opened.
    """
    t_path = glob.glob(tweetPre+dtpure+"/"+str(newsID)+"_*")
    if len(t_path) != 1:
        print('no tweets for news ',newsID,'len(t_path)',len(t_path))
        return
    # glob can report an entry that does not resolve (e.g. a dangling
    # symlink); previously this left the file handle unbound.
    if not os.path.exists(t_path[0]):
        return

    # Spellings of the is_retweet column that mean "not a retweet".
    false_tokens = ('', '0', 'false', 'no', 'none')
    # The context manager closes the file even if a row raises.
    with codecs.open(t_path[0], encoding = 'utf-8') as t:
        for line in t:
            fields = line.strip().split("\t")
            # Rows with fewer than 24 columns are treated as malformed.
            if len(fields) < 24:
                continue
            # Only the columns used below are read:
            # 0 = ID, 1 = raw_text, 2 = created_at, 4 = hash_tags,
            # 8 = is_retweet, 9 = retweet_count.
            ID = fields[0]
            raw_text = fields[1]
            created_at = fields[2]
            hash_tags = fields[4]
            is_retweet = fields[8]
            retweet_count = fields[9]
            try:
                ID = int(ID)
            except ValueError:
                continue

            words = raw_text.split()
            # Skip known tweets and very short ones (five words or fewer).
            if ID in tweetsObj or len(words) <= 5:
                continue
            # bool() of any non-empty string is True, so "False"/"0" used to
            # be read as retweets; interpret the text instead.
            is_retweet = is_retweet.strip().lower() not in false_tokens
            try:
                retweet_count = int(retweet_count)
            except ValueError:
                retweet_count = -1  # sentinel for an unparsable count
            # convert url to http
            # NOTE(review): this cuts at the FIRST 'http' in the text, so
            # anything between an earlier URL and the trailing one is
            # dropped -- confirm that is intended.
            if words[-1].startswith('http'):
                raw_text = raw_text.split('http')[0] + 'http'
            # ASCII transliteration via unidecode.
            raw_text = unidecode.unidecode(raw_text)
            tweet = bk.Tweet(ID,raw_text,created_at,is_retweet,retweet_count,hash_tags,pop=1)
            tweetsObj[ID] = tweet
Example #3
0
def getRelTweets(newsID, dtpure, tweetPre, tweetIDset, tweetSet):
    """Collect the new, URL-free, non-retweet tweets stored for one news item.

    The tweet file is located with the glob pattern
    ``tweetPre + dtpure + "/" + str(newsID) + "_*"`` and read as UTF-8,
    one tab-separated tweet per line.  A tweet is kept when its text
    contains neither ``"http"`` nor ``"RT @"`` and neither its ID nor its
    text has been seen before.

    Args:
        newsID: Identifier of the news item; used as the file-name prefix.
        dtpure: Sub-directory name appended to ``tweetPre``.
        tweetPre: Path prefix of the tweet directories.  It is concatenated
            as-is, so it must already end with a path separator.
        tweetIDset: Set of tweet IDs already collected; updated in place.
        tweetSet: Set of tweet texts already collected; updated in place.

    Returns:
        A tuple ``(tweets, tweetsObj)`` where ``tweets`` is the list of kept
        tweet texts and ``tweetsObj`` the matching list of ``bk.Tweet``
        objects.  Both lists are empty when the glob does not match exactly
        one file or when the match cannot be opened.
    """
    t_path = glob.glob(tweetPre + dtpure + "/" + str(newsID) + "_*")
    if len(t_path) != 1:
        print('no tweets for news ', newsID, 'len(t_path)', len(t_path))
        return ([], [])
    # glob can report an entry that does not resolve (e.g. a dangling
    # symlink); previously this left the file handle unbound.
    if not os.path.exists(t_path[0]):
        return ([], [])

    # Spellings of the is_retweet column that mean "not a retweet".
    false_tokens = ('', '0', 'false', 'no', 'none')
    tweets = []
    tweetsObj = []
    # The context manager closes the file even if a row raises.
    with codecs.open(t_path[0], encoding='utf-8') as t:
        for line in t:
            fields = line.strip().split("\t")
            # Rows with fewer than 24 columns are treated as malformed.
            if len(fields) < 24:
                continue
            # Only the columns used below are read:
            # 0 = ID, 1 = raw_text, 2 = created_at, 4 = hash_tags,
            # 8 = is_retweet, 9 = retweet_count.
            ID = fields[0]
            raw_text = fields[1]
            created_at = fields[2]
            hash_tags = fields[4]
            is_retweet = fields[8]
            retweet_count = fields[9]
            try:
                ID = int(ID)
            except ValueError:
                continue
            # bool() of any non-empty string is True, so "False"/"0" used to
            # be read as retweets; interpret the text instead.
            is_retweet = is_retweet.strip().lower() not in false_tokens
            try:
                retweet_count = int(retweet_count)
            except ValueError:
                retweet_count = -1  # sentinel for an unparsable count
            # Tweets carrying a link or a retweet marker are dropped
            # entirely rather than cleaned up.
            if "http" in raw_text or "RT @" in raw_text:
                continue
            if ID in tweetIDset or raw_text in tweetSet:
                continue
            tweet = bk.Tweet(ID, raw_text, created_at, is_retweet,
                             retweet_count, hash_tags)
            tweetsObj.append(tweet)
            tweets.append(raw_text)
            tweetIDset.add(ID)
            tweetSet.add(raw_text)
    return (tweets, tweetsObj)