def updateGrapWith(tweet):
    """Fold one tweet into the hashtag graph and expire stale window entries.

    Reads ``tweet['created_at']`` and ``tweet['entities']['hashtags']`` (each
    hashtag entry must expose a ``'text'`` key). The tweet is always recorded
    in the module-level ``tweetWindow``; it contributes edges to the
    module-level ``graph`` only when it carries at least two distinct
    normalized hashtags. Afterwards, every entry at the front of the window
    that is more than 60 seconds older than this tweet is popped and its
    edges are withdrawn from ``graph``.

    NOTE(review): ``tweetWindow`` is used via ``append``/``popleft``/``[0]``,
    so it is presumably a ``collections.deque``; ``graph`` is used as a
    mapping from tag to a list of adjacent tags -- confirm against the
    module-level definitions.

    Returns:
        The (mutated) module-level ``graph``.
    """
    # The pattern contains a literal "+0000", so only timestamps carrying
    # that offset parse; the resulting datetime is naive.
    stamp = dt.datetime.strptime(tweet['created_at'], '%a %b %d %H:%M:%S +0000 %Y')

    rawTags = tweet['entities']['hashtags']
    uniqueTags = set()
    if len(rawTags) > 1:
        # Drop non-ASCII characters and lower-case so that spellings differing
        # only in case collapse into a single tag.
        uniqueTags = set(
            replaceControlChars(entry['text'].encode('ascii', 'ignore')).lower()
            for entry in rawTags
        )

    if len(uniqueTags) >= 2:
        tweetWindow.append({'time': stamp, 'hashtags': uniqueTags})
        for tag in uniqueTags:
            if tag not in graph:
                graph[tag] = []
            # The same adjacency may be stored several times on purpose: one
            # copy per tweet that produced it, so eviction can remove exactly
            # one copy per expired tweet.
            graph[tag].extend([other for other in uniqueTags if other != tag])
    else:
        # Fewer than two distinct tags: keep only the timestamp so the tweet
        # still takes part in the sliding window but adds no edges.
        tweetWindow.append({'time': stamp})

    # Evict from the front while the oldest entry is more than 60 seconds
    # behind the tweet just processed.
    while (stamp - tweetWindow[0]["time"]).total_seconds() > 60:
        expired = tweetWindow.popleft()
        if 'hashtags' not in expired:
            continue
        expiredTags = expired['hashtags']
        for tag in expiredTags:
            for other in set(expiredTags).difference([tag]):
                graph[tag].remove(other)
            # A tag with no remaining neighbours is dropped from the graph.
            if not graph[tag]:
                del graph[tag]

    return graph
def getTextAndTimestamp(tweet):
    """Return the cleaned tweet text followed by its ``created_at`` value.

    The text is encoded to ASCII with unencodable characters ignored and then
    passed through ``replaceControlChars`` (defined elsewhere in this module).
    Whenever the cleaned text compares unequal to the original text, the
    function attribute ``getTextAndTimestamp.tweetsWithUnicode`` is
    incremented; that attribute must be initialised by the caller beforehand.

    Returns:
        A string of the form ``"<cleaned text> (timestamp: <created_at>)"``.
    """
    originalText = tweet["text"]
    asciiText = replaceControlChars(originalText.encode("ascii", "ignore"))
    createdAt = tweet["created_at"]

    # Count tweets whose text was altered by the cleaning step.
    if originalText != asciiText:
        getTextAndTimestamp.tweetsWithUnicode += 1

    return "%s (timestamp: %s)" % (asciiText, createdAt)