def window(self, tweets_list):
    """Score tweet pairs in a single shrinking window.

    For each anchor position, computes the TF-IDF cosine similarity
    between the anchor tweet and every later tweet, discards scores of
    exactly 0.0 or 1.0, and records the retained scores together with
    ``(anchor_index, retained_count)`` tracker tuples.

    Parameters
    ----------
    tweets_list : list of str
        Tweets to compare pairwise.

    Returns
    -------
    dict
        Maps 'Window_0'..'Window_4' to ``{'Top Scores': [...],
        'Tracker': [...]}``.  NOTE(review): only 'Window_4' is ever
        populated because the loop variable ``r`` leaks from the
        initialisation loop below — confirm the intended window
        assignment before relying on the other keys.
    """
    self.tweets_list = tweets_list
    frame = {}  # holds every window instance and its associated metrics
    for r in range(5):
        # per-window: retained scores plus (anchor, count) tracker tuples
        frame['Window_' + str(r)] = {'Top Scores': [], 'Tracker': []}
    anchor_index = 0
    other_index = 1
    while other_index < len(tweets_list) - 1:
        discarded_pair = 0  # pairs scoring exactly 0.0 or 1.0
        retained_pair = 0   # pairs scoring strictly between 0.0 and 1.0
        for anchor_tweet, other_tweet in zipper(tweets_list[anchor_index:],
                                                tweets_list[other_index:]):
            if other_tweet == tweets_list[-1]:
                # BUG FIX: original tested substring containment
                # (`other_tweet in tweets_list[1:][-1]`); equality is the
                # intended guard against running past the final tweet.
                break
            anchor_tweet = tweets_list[anchor_index]  # anchor stays fixed for this pass
            tweet_pair = [anchor_tweet, other_tweet]  # transient pair for similarity only
            vectorizer = TfidfVectorizer(lowercase=False)
            vectorised_pair = vectorizer.fit_transform(tweet_pair)  # TF-IDF encoding
            cosim = np.round(
                cosine_similarity(vectorised_pair.toarray()[0].reshape(1, -1),
                                  vectorised_pair.toarray()[1].reshape(1, -1)).flatten(), 2)
            if cosim[0] == 0.0 or cosim[0] == 1.0:
                discarded_pair += 1
                continue
            frame['Window_' + str(r)]['Top Scores'].append(cosim[0])
            retained_pair += 1
            frame['Window_' + str(r)]['Tracker'].append((anchor_index, retained_pair))
        anchor_index += 1  # next tweet becomes the anchor
        other_index += 1   # shrink the remaining window by one
    # BUG FIX: the original ended with `return window`, returning the
    # method object itself; callers need the accumulated frame.
    return frame
def tweets_sim1(self, tweets_list):
    """Compute pairwise cosine similarity for every possible anchor.

    Unlike ``tweets_sim`` (single fixed anchor), every position serves
    as an anchor in turn: a window of size 11 yields 10 sub-windows of
    decreasing size.

    Parameters
    ----------
    tweets_list : list of str
        Tweets to compare pairwise.

    Returns
    -------
    tuple of (list of float, list of tuple)
        All retained similarity scores across anchors, and a tracker of
        ``(anchor_index, retained_count_so_far)`` recorded per pair.
    """
    self.tweets_list = tweets_list
    anchor_index = 0
    other_index = 1
    all_similarity_scores = []
    tracker = []
    while other_index < len(tweets_list) - 1:
        similarity_scores = []  # this anchor's scores strictly between 0.0 and 1.0
        discarded = 0           # pairs scoring exactly 0.0 or 1.0
        for anchor_tweet, other_tweet in zipper(tweets_list[anchor_index:],
                                                tweets_list[other_index:]):
            if other_tweet == tweets_list[-1]:
                # BUG FIX: original tested substring containment
                # (`other_tweet in tweets_list[1:][-1]`); equality is the
                # intended end-of-list guard.
                break
            anchor_tweet = tweets_list[anchor_index]  # anchor stays fixed for this pass
            tweet_pair = [anchor_tweet, other_tweet]  # transient pair for similarity only
            vectorizer = TfidfVectorizer(lowercase=False)
            vectorised_pair = vectorizer.fit_transform(tweet_pair)  # TF-IDF encoding
            cosim = np.round(
                cosine_similarity(vectorised_pair.toarray()[0].reshape(1, -1),
                                  vectorised_pair.toarray()[1].reshape(1, -1)).flatten(), 2)
            if cosim[0] == 0.0 or cosim[0] == 1.0:
                discarded += 1
                continue
            similarity_scores.append(cosim[0])
            all_similarity_scores.append(cosim[0])
            # track the anchor and how many tweets are similar to it so far
            tracker.append((anchor_index, len(similarity_scores)))
        anchor_index += 1  # next tweet becomes the anchor
        other_index += 1   # shrink the remaining window by one
    return all_similarity_scores, tracker
def frame(self, tweets_list, frame_size=3):
    """Break *tweets_list* into *frame_size* windows and score each one.

    A frame is a unit of computation: a large tweet list is split into
    finite windows and per-window metrics are returned.  Within each
    window, pairs are scored with TF-IDF cosine similarity; scores
    above 0.85 go to 'Top Scores', other in-range scores to
    'All Scores', and per-anchor retained counts to 'Counter'.

    Parameters
    ----------
    tweets_list : sequence of indexable records
        Split with ``np.array_split``; index 1 of each record is the
        tweet text and index 0 its posting time (used for relative
        posting-time computation downstream).
    frame_size : int, optional
        Number of windows per frame (default 3).

    Returns
    -------
    dict
        Maps 'Window_i' to ``{'All Scores', 'Counter', 'Score Tracker',
        'Top Scores'}`` lists.  'Score Tracker' is left empty (the
        original population code was dead/commented out).
    """
    self.tweets_list = tweets_list
    self.frame_size = frame_size
    window_tweets = np.array_split(tweets_list, frame_size)  # one chunk per window
    frame = {}
    # BUG FIX: the original wrapped this pass in `while k < frame_size`,
    # recomputing identical results frame_size times (each pass reset the
    # per-window dicts before appending again); one pass suffices.
    # BUG FIX: the original iterated zip(range(window_size), window_tweets)
    # with window_size = len(tweets)/frame_size, which could truncate the
    # set of windows processed; enumerate visits every window.
    for window, window_tweet in enumerate(window_tweets):
        frame['Window_' + str(window)] = {'All Scores': [], 'Counter': [],
                                          'Score Tracker': [], 'Top Scores': []}
        anchor_index = 0
        other_index = 1
        while other_index < len(window_tweet) - 1:
            discarded_pair = 0  # pairs scoring exactly 0.0 or 1.0
            retained_pair = 0   # pairs with usable in-range scores
            tracked_index = 2   # index of the non-anchor tweet being compared
            for anchor_tweet, other_tweet in zipper(window_tweet[anchor_index:],
                                                    window_tweet[other_index:]):
                if other_tweet in window_tweet[1:][-1]:
                    # NOTE(review): membership test against the final
                    # element; presumably intended as an end-of-window
                    # equality guard — confirm for array-typed records.
                    break
                anchor_tweet = window_tweet[anchor_index]  # anchor stays fixed for this pass
                tweet_pair = [anchor_tweet[1], other_tweet[1]]  # tweet texts only
                vectorizer = TfidfVectorizer(max_features=3, lowercase=False)
                vectorised_pair = vectorizer.fit_transform(tweet_pair)  # TF-IDF encoding
                cosim = np.round(
                    cosine_similarity(vectorised_pair.toarray()[0].reshape(1, -1),
                                      vectorised_pair.toarray()[1].reshape(1, -1)).flatten(), 2)
                if cosim[0] == 0.0 or cosim[0] == 1.0:
                    discarded_pair += 1
                elif cosim[0] > 0.85:
                    # ((anchor_idx, other_idx), (anchor_time, other_time), score);
                    # posting times feed relative-posting-time computation.
                    frame['Window_' + str(window)]['Top Scores'].append(
                        ((anchor_index, tracked_index),
                         (anchor_tweet[0], other_tweet[0]), cosim[0]))
                else:
                    frame['Window_' + str(window)]['All Scores'].append(
                        ((anchor_index, tracked_index),
                         (anchor_tweet[0], other_tweet[0]), cosim[0]))
                retained_pair += 1
                tracked_index += 1  # advance the inner-tweet index
            frame['Window_' + str(window)]['Counter'].append((anchor_index, retained_pair))
            anchor_index += 1  # next tweet becomes the anchor
            other_index += 1   # shrink the remaining window by one
    return frame
def tweets_sim(self, tweets_list):
    """Compute cosine similarity between one fixed anchor and the rest.

    The anchor is ``tweets_list[1]``; it is compared against each
    subsequent tweet using TF-IDF cosine similarity.  Scores of exactly
    0.0 or 1.0 are discarded.

    Parameters
    ----------
    tweets_list : list of str
        Tweets to compare; element 1 is the anchor.

    Returns
    -------
    list of float
        Retained similarity scores, rounded to 2 decimal places.
    """
    self.tweets_list = tweets_list
    similarity_scores = []  # scores strictly between 0.0 and 1.0
    discarded = 0           # pairs scoring exactly 0.0 or 1.0
    for anchor_tweet, other_tweet in zipper(tweets_list[1:], tweets_list[2:]):
        if other_tweet == tweets_list[-1]:
            # BUG FIX: original tested substring containment
            # (`other_tweet in tweets_list[2:][-1]`); equality is the
            # intended end-of-list guard.
            break
        anchor_tweet = tweets_list[1]  # anchor is fixed for the whole window
        tweet_pair = [anchor_tweet, other_tweet]  # transient pair for similarity only
        vectorizer = TfidfVectorizer(lowercase=False)
        vectorised_pair = vectorizer.fit_transform(tweet_pair)  # TF-IDF encoding
        cosim = np.round(
            cosine_similarity(vectorised_pair.toarray()[0].reshape(1, -1),
                              vectorised_pair.toarray()[1].reshape(1, -1)).flatten(), 2)
        if cosim[0] == 0.0 or cosim[0] == 1.0:
            discarded += 1
            continue
        similarity_scores.append(cosim[0])
    return similarity_scores
def get_tweet_status(self, data_tuple):
    """Score tweet pairs in windows and return the metrics as one DataFrame.

    Splits *data_tuple* into 10 windows, computes TF-IDF cosine
    similarity between each anchor and the subsequent tweets in its
    window, and collects per-pair metadata (posting times, relative
    posting time, engagement counts, period of day) keyed by window.

    Parameters
    ----------
    data_tuple : sequence of indexable records
        Each record exposes: [0] posting time (int-convertible),
        [1] text, [2] period of day, [3] favourite count,
        [4] follower count, [5] friend count.

    Returns
    -------
    pandas.DataFrame
        One row per retained pair, all windows concatenated.
    """
    self.data_tuple = data_tuple
    frame_size = 10
    window_tweets = np.array_split(data_tuple, frame_size)  # one chunk per window
    frame = {}
    # BUG FIX: the original wrapped this pass in `while k < frame_size`,
    # recomputing identical per-window results ten times; one pass
    # produces the same frame.
    # BUG FIX: zip(range(window_size), window_tweets) could truncate the
    # set of windows processed; enumerate visits all of them.
    for window, window_tweet in enumerate(window_tweets):
        frame['Window_' + str(window)] = {
            'AnchorID': [], 'AnchorTweet': [], 'OtherTweet': [],
            'PostingTimes': [], 'RelativeTime': [], 'PairIndices': [],
            'CoSim': [], 'Discards': [], 'FavCount': [], 'FolCount': [],
            'Friends': [], 'Period': []}
        anchor_index = 0
        other_index = 1
        while other_index < len(window_tweet) - 1:
            discarded_pair = 0  # pairs scoring exactly 0.0 or 1.0
            retained_pair = 0   # pairs with usable in-range scores
            tracked_index = 2   # index of the non-anchor tweet being compared
            for anchor_tweet, other_tweet in zipper(window_tweet[anchor_index:],
                                                    window_tweet[other_index:]):
                try:
                    anchor_tweet = window_tweet[anchor_index]  # anchor fixed for this pass
                    tweet_pair = [anchor_tweet[1], other_tweet[1]]  # tweet texts only
                    vectorizer = TfidfVectorizer(max_features=3, lowercase=False)
                    vectorised_pair = vectorizer.fit_transform(tweet_pair)  # TF-IDF encoding
                    cosim = np.round(
                        cosine_similarity(vectorised_pair.toarray()[0].reshape(1, -1),
                                          vectorised_pair.toarray()[1].reshape(1, -1)).flatten(), 2)
                    if cosim[0] == 0.0 or cosim[0] == 1.0:
                        discarded_pair += 1  # track less important scores
                        continue
                    bucket = frame['Window_' + str(window)]
                    bucket['AnchorID'].append('A_' + str(anchor_index))
                    bucket['AnchorTweet'].append(anchor_tweet[1])
                    bucket['OtherTweet'].append(other_tweet[1])
                    bucket['PostingTimes'].append((anchor_tweet[0], other_tweet[0]))
                    # relative posting time / time difference of the pair
                    bucket['RelativeTime'].append(int(other_tweet[0]) - int(anchor_tweet[0]))
                    bucket['PairIndices'].append((anchor_index, tracked_index))
                    bucket['CoSim'].append(cosim[0])
                    bucket['Discards'].append(discarded_pair)
                    bucket['FavCount'].append(anchor_tweet[3])   # favourite-count history
                    bucket['FolCount'].append(anchor_tweet[4])   # follower count
                    bucket['Friends'].append(anchor_tweet[5])    # friend count
                    bucket['Period'].append(anchor_tweet[2])     # e.g. morning or afternoon
                    retained_pair += 1
                    tracked_index += 1  # advance the inner-tweet index
                except Exception:
                    # BUG FIX: original used a bare `except:`, which also
                    # swallows SystemExit/KeyboardInterrupt.  Still
                    # best-effort: malformed records are skipped, not fatal.
                    continue
            anchor_index += 1  # next tweet becomes the anchor
            other_index += 1   # shrink the remaining window by one
    # BUG FIX: DataFrame.append was deprecated and removed in pandas 2.0;
    # concatenate the per-window frames in one pd.concat call instead.
    return pd.concat([pd.DataFrame(frame[key]) for key in frame])