# --- Example 1 ---
    def window (self, tweets_list):
        """Build five windows over *tweets_list* and their similarity metrics.

        For each of five identically-computed windows, every tweet in turn acts
        as an anchor that is compared (TF-IDF + cosine similarity) against each
        subsequent tweet.  Scores of exactly 0.0 or 1.0 are discarded.

        Returns a dict mapping 'Window_r' -> {'Top Scores': [retained scores],
        'Tracker': [(anchor_index, retained_count), ...]}.

        Fix: the original ended with ``return window``, which returns the wrong
        object (or raises NameError) instead of the accumulated ``frame``.
        """
        self.tweets_list = tweets_list
        frame = {}  # maps window label -> its top scores and per-anchor tracker
        for r in range(5):
            frame['Window_'+str(r)] = {'Top Scores': [], 'Tracker': []}
            anchor_index = 0
            other_index = 1
            while other_index < len(tweets_list) - 1:
                discarded_pair = 0  # pairs scoring exactly 0.0 or 1.0
                retained_pair = 0   # pairs scoring strictly between 0.0 and 1.0
                for anchor_tweet, other_tweet in zipper(tweets_list[anchor_index:], tweets_list[other_index:]):
                    if other_tweet in tweets_list[1:][-1]:  # final tweet reached -> end this window
                        break
                    anchor_tweet = tweets_list[anchor_index]  # anchor stays fixed within the window
                    tweet_pair = [anchor_tweet, other_tweet]  # transient pair for similarity computation
                    vectorizer = TfidfVectorizer(lowercase=False)
                    # convert the pair to numeric TF-IDF vectors; materialise once
                    vectorised_pair = vectorizer.fit_transform(tweet_pair).toarray()
                    cosim = np.round(cosine_similarity(vectorised_pair[0].reshape(1, -1), vectorised_pair[1].reshape(1, -1)).flatten(), 2)
                    if cosim[0] == 0.0 or cosim[0] == 1.0:
                        discarded_pair += 1
                        continue
                    frame['Window_'+str(r)]['Top Scores'].append(cosim[0])
                    retained_pair += 1
                frame['Window_'+str(r)]['Tracker'].append((anchor_index, retained_pair))
                anchor_index += 1  # next tweet becomes the next anchor
                other_index += 1   # window shrinks by one

        return frame  # FIX: was `return window`
# --- Example 2 ---
 def tweets_sim1(self, tweets_list):
     """Pairwise cosine similarity with every tweet serving as an anchor.

     Unlike tweets_sim (single fixed anchor), each tweet in turn anchors a
     shrinking window of later tweets, so a window of size 11 yields 10
     sub-windows of decreasing size.  Scores of exactly 0.0 or 1.0 are
     discarded.  Returns (all retained scores across windows,
     [(anchor index, retained count per window), ...]).
     """
     self.tweets_list = tweets_list
     anchor_pos, partner_pos = 0, 1
     retained_all = []  # every retained score, across all anchors
     per_anchor = []    # (anchor index, number of retained scores) per window
     while partner_pos < len(tweets_list) - 1:
         kept_here = []  # scores strictly between 0.0 and 1.0 for this anchor
         dropped = 0     # pairs scoring exactly 0.0 or 1.0
         for _, candidate in zipper(tweets_list[anchor_pos:], tweets_list[partner_pos:]):
             if candidate in tweets_list[1:][-1]:  # final tweet reached -> end this window
                 break
             # anchor stays fixed for the whole window; pair is transient
             pair = [tweets_list[anchor_pos], candidate]
             matrix = TfidfVectorizer(lowercase = False).fit_transform(pair).toarray()
             score = np.round(cosine_similarity(matrix[0].reshape(1, -1), matrix[1].reshape(1, -1)).flatten(), 2)[0]
             if score == 0.0 or score == 1.0:
                 dropped += 1
                 continue
             kept_here.append(score)
             retained_all.append(score)
         per_anchor.append((anchor_pos, len(kept_here)))
         anchor_pos += 1
         partner_pos += 1
     return retained_all, per_anchor
# --- Example 3 ---
 def frame(self, tweets_list, frame_size = 3):
     """Split *tweets_list* into windows and compute per-window similarity metrics.

     A frame is a unit of computation: it breaks a (possibly very large) list of
     tweets into `frame_size` windows and returns the associated metrics for each.
     Each window records 'All Scores' (retained pairs), 'Top Scores' (pairs with
     cosine similarity > 0.85), a 'Counter' of (anchor index, retained count), and
     an unused 'Score Tracker' slot.  Tweet elements are indexed as
     anchor_tweet[0] (posting time) and anchor_tweet[1] (text) -- inferred from
     usage below; confirm against the caller.
     """
     self.tweets_list = tweets_list # list of tweets to be broken into n windows
     self.frame_size = frame_size # number of windows in each frame
     window_size = int(len(tweets_list)/frame_size) # size of each window in the frame
     window_tweets = np.array_split(tweets_list, frame_size) # split tweets into window tweets according to the frame size, default is 3
     frame = {} # initialise empty frame to store all window instances and associated values
     k = 0 # initialise stopping criterion
     # NOTE(review): zip(range(window_size), window_tweets) iterates only
     # min(window_size, frame_size) windows, and each pass of the outer while
     # rebuilds the same frame dict -- looks redundant; confirm intent.
     while k < frame_size:
         for window, window_tweet in zip(range(window_size),window_tweets):
             frame['Window_'+str(window)] = {'All Scores':[],'Counter':[], 'Score Tracker':[], 'Top Scores':[]} # for each window store its top scores, tuple of top scores and indices
             anchor_index = 0
             other_index = 1
             while other_index < len(window_tweet)-1:
                 discarded_pair = 0 # number of pairs with similarity of exactly 0.0 or 1.0
                 retained_pair = 0 # number of pairs with similarity strictly between 0.0 and 0.85 inclusive of neither endpoint branch
                 tracked_index = 2# index of the non-anchor tweet currently compared with the anchor
                 for anchor_tweet, other_tweet in zipper(window_tweet[anchor_index:], window_tweet[other_index:]):
                     tweet_pair = [] #  only stores the pair of tweet texts for similarity computation; rebuilt every iteration
                     if other_tweet in window_tweet[1:][-1]: # avoids TypeError on reaching the final tweet in the list
                         break
                     anchor_tweet = window_tweet[anchor_index] # make the anchor tweet constant for each iteration in the window
                     tweet_pair.append(anchor_tweet[1]), tweet_pair.append(other_tweet[1])# element [1] presumably holds the tweet text -- confirm upstream
                     vectorizer = TfidfVectorizer(max_features= 3,lowercase = False)
                     vectorised_pair = vectorizer.fit_transform(tweet_pair) # convert tweet_pair to numeric using tfidf scheme
                     cosim = np.round(cosine_similarity(vectorised_pair.toarray()[0].reshape(1,-1), vectorised_pair.toarray()[1].reshape(1,-1)).flatten(),2)
                     if cosim[0] == 0.0 or cosim[0] == 1.0:
                         discarded_pair +=1
                     elif cosim[0] > 0.85:
                         #try:# avoid posting times with some commas
                         frame['Window_'+str(window)]['Top Scores'].append(((anchor_index,tracked_index),(anchor_tweet[0],other_tweet[0]),cosim[0]))
                         #except:
                         #    continue
                         #frame['Window_'+str(window)]['Top Scores'].append(((anchor_index,tracked_index),(anchor_tweet[0],other_tweet[0]),cosim[0])) # update top scores wrt to the anchor
                     else:# anchor_tweet[0] and other_tweet[0] refer to the posting times used in computing relative posting time
                         #try:
                         frame['Window_'+str(window)]['All Scores'].append(((anchor_index,tracked_index), (anchor_tweet[0],other_tweet[0]),cosim[0])) # update the frame data structure for this anchor
                         #except:
                         #    continue
                         retained_pair +=1
                     tracked_index+=1 # update the index of the inner tweet being compared with the anchor
                 frame['Window_'+str(window)]['Counter'].append((anchor_index, retained_pair)) # record how many pairs this anchor retained
                 # update the score Tracker (disabled: the block below is a no-op string literal, kept for reference)
                 """q = np.array(frame['Window_'+str(window)]['All Scores']) # convert list of scores in pos. in the tuple to np array for computational ease
                 while q.shape[0]>1: # execute as long as length of the array is > 1
                     frame['Window_'+str(window)]['Score Tracker'].append((q.max(),q.argmax())) # track top scores and their indices in windows
                     q = np.delete(q, q.argmax())""" #  delete used score and index
                 # update indices of anchor and other tweets
                 anchor_index +=1 # pick the next tweet as the next anchor
                 other_index +=1 # shrink the window size by a factor of 1
         # update stopping criteria:
         k+=1
     return frame
# --- Example 4 ---
 def tweets_sim(self, tweets_list):
     """Cosine similarity of later tweets against one fixed anchor.

     The anchor is tweets_list[1]; each subsequent tweet is vectorised with
     TF-IDF and compared against it.  Scores of exactly 0.0 or 1.0 are
     discarded; the remaining rounded scores are returned in order.
     """
     self.tweets_list = tweets_list
     kept = []    # scores strictly between 0.0 and 1.0
     dropped = 0  # count of pairs scoring exactly 0.0 or 1.0
     for _, candidate in zipper(tweets_list[1:], tweets_list[2:]):
         if candidate in tweets_list[2:][-1]:  # final tweet reached -> stop
             break
         # anchor is always the second tweet; pair is transient
         pair = [tweets_list[1], candidate]
         matrix = TfidfVectorizer(lowercase = False).fit_transform(pair).toarray()
         score = np.round(cosine_similarity(matrix[0].reshape(1, -1), matrix[1].reshape(1, -1)).flatten(), 2)[0]
         if score == 0.0 or score == 1.0:
             dropped += 1
             continue
         kept.append(score)
     return kept
# --- Example 5 ---
    def get_tweet_status(self, data_tuple):
        """Split *data_tuple* into windows, compute pairwise tweet similarity per
        window, and return all window metrics as one pandas DataFrame.

        Each record is indexed as (posting_time, text, period, fav_count,
        follower_count, friend_count) -- inferred from the indices used below;
        confirm against the caller.  Pairs scoring exactly 0.0 or 1.0 are
        discarded.

        Fixes: ``DataFrame.append`` (removed in pandas 2.0) replaced with
        ``pd.concat``; bare ``except:`` narrowed to ``except Exception`` so
        SystemExit/KeyboardInterrupt are no longer swallowed.
        """
        self.data_tuple = data_tuple

        frame_size = 10  # number of windows the input is split into
        window_size = int(len(data_tuple)/frame_size)  # size of each window in the frame
        window_tweets = np.array_split(data_tuple, frame_size)  # split records into windows
        frame = {}  # maps window label -> columnar metric lists
        dfs = pd.DataFrame()
        k = 0  # stopping criterion for the outer pass
        # NOTE(review): every pass of this while-loop recomputes the same frame;
        # preserved as-is to keep behavior identical -- confirm intent.
        while k < frame_size:
            for window, window_tweet in zip(range(window_size), window_tweets):
                frame['Window_'+str(window)] = {'AnchorID':[],'AnchorTweet':[],'OtherTweet':[],'PostingTimes':[], 'RelativeTime':[],\
                'PairIndices':[],'CoSim':[],'Discards':[],'FavCount':[],'FolCount':[],'Friends':[],'Period':[]}#,'Counter':[]}
                anchor_index = 0
                other_index = 1
                while other_index < len(window_tweet)-1:
                    discarded_pair = 0  # pairs scoring exactly 0.0 or 1.0
                    retained_pair = 0   # pairs scoring strictly between 0.0 and 1.0
                    tracked_index = 2   # index of the non-anchor tweet being compared
                    for anchor_tweet, other_tweet in zipper(window_tweet[anchor_index:], window_tweet[other_index:]):
                        try:
                            anchor_tweet = window_tweet[anchor_index]  # anchor fixed within the window
                            tweet_pair = [anchor_tweet[1], other_tweet[1]]  # [1] presumably the tweet text
                            vectorizer = TfidfVectorizer(max_features= 3,lowercase = False)
                            vectorised_pair = vectorizer.fit_transform(tweet_pair)  # TF-IDF encode the pair
                            cosim = np.round(cosine_similarity(vectorised_pair.toarray()[0].reshape(1,-1), vectorised_pair.toarray()[1].reshape(1,-1)).flatten(),2)
                            if cosim[0] == 0.0 or cosim[0] == 1.0:
                                discarded_pair +=1  # track degenerate scores
                                continue
                            frame['Window_'+str(window)]['AnchorID'].append('A_'+str(anchor_index))  # id of the anchor
                            frame['Window_'+str(window)]['AnchorTweet'].append(anchor_tweet[1])  # anchor text
                            frame['Window_'+str(window)]['OtherTweet'].append(other_tweet[1])  # compared text
                            frame['Window_'+str(window)]['PostingTimes'].append((anchor_tweet[0],other_tweet[0]))  # posting times of the pair
                            frame['Window_'+str(window)]['RelativeTime'].append((int(other_tweet[0])-int(anchor_tweet[0])))  # time difference
                            frame['Window_'+str(window)]['PairIndices'].append((anchor_index,tracked_index))  # indices of the pair
                            frame['Window_'+str(window)]['CoSim'].append(cosim[0])  # rounded similarity score
                            frame['Window_'+str(window)]['Discards'].append(discarded_pair)  # running discard count
                            frame['Window_'+str(window)]['FavCount'].append(anchor_tweet[3])  # favourite count
                            frame['Window_'+str(window)]['FolCount'].append(anchor_tweet[4])  # follower count
                            frame['Window_'+str(window)]['Friends'].append(anchor_tweet[5])  # friend count
                            frame['Window_'+str(window)]['Period'].append(anchor_tweet[2])  # period of day, e.g. morning
                            retained_pair +=1
                            tracked_index+=1  # advance the inner comparison index
                        except Exception:
                            # best-effort: skip malformed records (e.g. short
                            # tuples or non-numeric times) and keep going
                            continue

                    anchor_index +=1  # next tweet becomes the next anchor
                    other_index +=1   # window shrinks by one
            k+=1  # update stopping criterion
            # assemble all windows into a single DataFrame
            # (pd.concat replaces DataFrame.append, removed in pandas 2.0)
            window_frames = [pd.DataFrame(frame[key]) for key in frame]
            dfs = pd.concat(window_frames) if window_frames else pd.DataFrame()
        return dfs