def hotMaterials(request): if request.method == "GET": pageID = request.GET["pageID"] helper = Helper() result = { "pageID" : pageID, "hotMaterials" : helper.getHotMaterials(pageID), } #print result #return HttpResponse(json.dumps(queryResults), mimetype="application/json") return helper.jsonp(request, result)
def getEvents(request, topic): # print "topic = ", topic # con = Connection() # db = con['cs3281'] # col = db['trending_events'] # events = [] # for r in col.find({'term':str(topic)}): # events.append(r['events']) helper = Helper() import codecs, re con = MongoClient('{username}:{password}@{host}'.format(**settings.MONGO_CONNECTION)) db = con['cs3281'] wikiCol = db['wiki_article'] wordPattern = re.compile(r'[\w]+') title_file = './wikiTitle.txt' #print "I am here!" wordSet = set() for doc in wikiCol.find(): if 'pageTitle' in doc: title = doc['pageTitle'] wordlist = re.findall(wordPattern, title.lower()) for word in wordlist: if len(word) > 3 and len(word) < 20: wordSet.add(word) fwp = codecs.open(title_file, 'w', 'utf-8') for word in wordSet: fwp.write( "%s\n" % (word) ) fwp.close() import datetime import codecs, re import wiki ISOTIMEFORMAT = '%Y-%m-%d' wikititle_file = './wikiTitle.txt' tweet_file = './tweet.txt' #min_time = datetime.datetime.max #max_time = datetime.datetime.min titleSet = set() # The set of titles of wiki articles wordSet = set() # To keep all the keywords in the tweet collection wordPattern = re.compile(r'[\w]+') def loadTitleSet(): fp = codecs.open(wikititle_file, 'r', 'utf-8') for line in fp.readlines(): title = line.strip() titleSet.add( title ) fp.close() def processTweets(): global min_time, max_time con = Connection() db = con['cs3281'] bef = datetime.date.today()-datetime.timedelta(days=5) tweetCol = db['tweets'] fwp = open(tweet_file, 'w') cnt = 0 topic_page_id = wiki.page(title = topic).pageid for doc in tweetCol.find({'pageID': topic_page_id, 'createdAt': {'$lt': datetime.datetime(datetime.date.today().year, datetime.date.today().month, datetime.date.today().day, 23, 59, 59), '$gte': datetime.datetime(bef.year, bef.month, bef.day, 0, 0, 0)}}): #print doc if 'createdAt' not in doc or 'text' not in doc: continue create_time = doc['createdAt'] min_time = create_time if create_time < min_time else min_time max_time = create_time if create_time > max_time else max_time timeString = create_time.strftime(ISOTIMEFORMAT) fwp.write(timeString) text = doc['text'] wordlist = re.findall(wordPattern, text.lower()) for word in wordlist: #if word in titleSet: wordSet.add( word ) fwp.write(" %s" % word) fwp.write("\n") cnt += 1 if cnt % 10000 == 0: print cnt fwp.close() def writeWordSet(): wordList = list( wordSet ) wordList.sort() fp = open('TweetWords.txt', 'w') for word in wordList: fp.write("%s\n" % word ) fp.close() def run(): loadTitleSet() processTweets() writeWordSet() #print min_time, max_time run() # Find relevant words for each term from datetime import datetime import string, time, operator, re, nltk import pprint class TopEvents: ISOTIMEFORMAT = '%Y-%m-%d' start_time = time.strptime('2014-10-15', ISOTIMEFORMAT) #start time of the twitter data set day_number = int((time.mktime(datetime.today().timetuple()) - time.mktime(start_time)) / 86400) #slice by day time_pattern = re.compile(r'\d{4}-\d{2}-\d{2}') term_pattern = re.compile(r'[a-zA-Z]+') stopwords = set() day_cnt_list = [] day_doc_list = [] day_weight_list = [] def __init__(self, stopword_file): with open(stopword_file, 'r') as fp: for line in fp: self.stopwords.add( line.strip() ) def get_top_K_events(self, tweets, alpha, K, N): self.initialize_days() self.update_days( tweets ) top_events = self.rank_events(alpha, K) #K: number of events #print "top_events = ", top_events result = [] #pprint.pprint(self.day_doc_list) for (day, weight) in top_events: freqdist = self.day_doc_list[day] #print "freqdist for day ", day, " = ", freqdist if len( freqdist.items() ) == 0: continue word_list = [] #for word, num in freqdist.items()[:N]: # N:number of words for each event for word, num in freqdist.most_common(N): word_list.append( ( word, num) ) actual_date = datetime.date(2014, 10, 15) + datetime.timedelta(days=day) actual_date_string = actual_date.strftime("%Y-%m-%d") result.append( (actual_date_string, weight, word_list) ) #pirint "result = ", result return sorted(result, key = get_date, reverse = True) def initialize_days(self): self.day_number = int((time.mktime(datetime.today().timetuple()) - time.mktime(self.start_time)) / 86400) self.day_number = self.day_number + 1 self.day_cnt_list = [0] * self.day_number self.day_doc_list = [nltk.FreqDist() for i in xrange(self.day_number)] self.day_weight_list = [0] * self.day_number def update_days(self, tweets): #print tweets for tweet in tweets: #print tweet time_string = re.search(self.time_pattern, tweet).group() post_time = time.strptime(time_string, self.ISOTIMEFORMAT) day = int((time.mktime(post_time) - time.mktime(self.start_time)) / 86400) term_list = self.get_terms( tweet ) #print "day = ", day self.day_cnt_list[ day ] += 1 self.day_doc_list[ day ].update( term_list ) def get_terms(self, tweet): term_list = re.findall(self.term_pattern, tweet.lower()) return [term for term in term_list if len(term) > 3 and len(term) < 20 and term not in self.stopwords] def rank_events(self, alpha, K): maxBurst = self.findMaxBurst() #print "maxBurst = ", maxBurst maxDay = self.getMaxDay() #print "maxDay = ", maxDay sort_list = [] for day in xrange(self.day_number): weight = self.get_weight( day, maxBurst, maxDay, alpha) sort_list.append( (day, weight) ) #print "sort_list when day = ", day, " = ", sort_list sort_list.sort( key = operator.itemgetter(1), reverse = True ) #descending order of weight return sort_list[:K] def findMaxBurst(self): maxBurst = 0 for burst in self.day_cnt_list: if burst > maxBurst: maxBurst = burst return maxBurst def getMaxDay(self): return self.day_number def get_weight(self, day, maxBurst, maxDay, alpha): burst = self.day_cnt_list[day] return alpha*burst/maxBurst + (1-alpha)*day/maxDay tweets = [] with open('tweet.txt', 'r') as fp: for line in fp: tweets.append( line.strip() ) te = TopEvents('stopwords') return helper.jsonp(request, te.get_top_K_events(tweets, 0.5, 10, 10))
def search_with_tokens(tokens, queryType, params): #print "params = ", params #user = UserSocialAuth.objects.filter(user=request.user).get() # Set up a new Twython object with the OAuth keys #pprint(tokens) print "app_key = ", settings.TWITTER_CONSUMER_KEY print "app_secret = ", settings.TWITTER_CONSUMER_SECRET print "oauth_token = ", tokens['oauth_token'] print "oauth_token_secret = ", tokens['oauth_token_secret'] t = Twython(app_key=settings.TWITTER_CONSUMER_KEY, app_secret=settings.TWITTER_CONSUMER_SECRET, oauth_token=tokens['oauth_token'], oauth_token_secret=tokens['oauth_token_secret']) # Obtain Twitter results using user's OAuth key if queryType == "search": params = params.copy() pageID = params["pageID"] del params["pageID"] queryResults = t.search(**(params)) # Use helper to update the tweet list of the page # Return an array of tweets to return to the client helper = Helper() # tweets = helper.updateWikiArticleTweet(pageID, queryResults) # Update the correct pageID helper.updateWikiArticleTweet(pageID, queryResults) # Update the correct pageID ''' tweetsSerialization = [] for tweet in tweets: tweetsSerialization.append({ "id" : tweet.id, "text" : tweet.text, "source" : tweet.source, "profileImageUrl" : tweet.profileImageUrl, "createdAt" : tweet.createdAt }) ''' # Get the correct logger based on convention: # {APP}.{FILE}.{FUNCTION} = __name__.'.'.inspect.stack()[0][3] # Need to evaulate performance of 'inspect' library as print function name # is not supported by Python #print __name__+"."+inspect.stack()[0][3] logger = logging.getLogger(__name__+"."+inspect.stack()[0][3]) # Fake infomation, to be updated later logger.info(json.dumps({ "pageID": pageID, # Pass the correct pageID here "queryType": queryType, "query": queryResults["search_metadata"]["query"], "tweetsCount": queryResults["search_metadata"]["count"], "source": 'TwitterAPI', })) return HttpResponse(json.dumps(queryResults), content_type="application/json") ## elif queryType == "articleSearch": ## params1 = dict(params) ## # TODO, swap this out later ## params1["q"] = unicode(params["articleId"]) ## del params1["articleId"] ## queryResults = t.search(**(params1)) elif queryType == "followersID": queryResults = t.getFollowersIDs(**(params)) elif queryType == "followersList": params = params.copy() screen_name = params["screen_name"] cursor = params["cursor"] queryResults = t.getFollowersList(**(params)) # Save results helper = Helper() helper.getUserFollower(screen_name, queryResults) elif queryType == "friendsID": queryResults = t.getFriendsIDs(**(params)) elif queryType == "friendshipLookup": queryResults = t.lookupFriendships(**(params)) elif queryType == "stream": queryResults = t.stream(params, on_results) return HttpResponse(json.dumps(queryResults), content_type="application/json")