def generateData(biased=True): ei = EventInterface() ei.setDB('historic_alarm') ei.setCollection('labeled_event') events = ei.getAllDocuments() EventFeature.GenerateArffFileHeader() true_events = [] false_events = [] for event in events: event = EventFeature(event) feature_vector = event.extractFeatures(3) if feature_vector[-1] == 1: true_events.append(feature_vector) else: false_events.append(feature_vector) random.shuffle(false_events) for fv in true_events: for i in xrange(0, len(fv) - 1): print fv[i], ',', print fv[-1] j = 0 for fv in false_events: for i in xrange(0, len(fv) - 1): print fv[i], ',', print fv[-1] j += 1 if not biased and j == len(true_events): break
def loadNextWeekData():
    """Load hand-labeled next-week candidate events from MongoDB.

    Reads "id,label" pairs from labeled_data_cf/label_next_week.txt,
    fetches each event, and drops events whose actual value is below 8
    or whose label is 0.

    Returns:
        (true_events, false_events): documents labeled 1 vs. all others.
    """
    ei = EventInterface()
    ei.setDB('citybeat')
    ei.setCollection('next_week_candidate_event_25by25_merged')
    true_events = []
    false_events = []
    # 'with' guarantees the label file is closed even if a DB lookup raises
    with open('labeled_data_cf/label_next_week.txt', 'r') as fid2:
        for line in fid2:
            t = line.split(',')
            event_id = str(t[0])
            label = int(t[1])
            event = ei.getDocument({'_id': ObjectId(event_id)})
            event['label'] = label
            e = Event(event)
            if e.getActualValue() < 8 or event['label'] == 0:
                # too small to matter, or explicitly unlabeled
                continue
            if event['label'] == 1:
                true_events.append(event)
            else:
                false_events.append(event)
    return true_events, false_events
def findTree(): ei = EventInterface() ei.setDB('citybeat') ei.setCollection('candidate_event_25by25_merged') events = {} fid1 = open('labeled_data_cf/181_positive.txt', 'r') true_events = [] for line in fid1: t = line.split(',') id = str(t[0]) label = int(t[1]) if label == 1: pass else: continue events[id] = label fid1.close() words = ['motor'] for id, label in events.items(): event = ei.getEventByID(id) e = Event(event) if e.containKeywords(words, 1): print id
def loadNextWeekData():
    """Return (true_events, false_events) for the next-week labeled set.

    Label pairs come from labeled_data_cf/label_next_week.txt; events with
    actual value < 8 or label 0 are skipped. Events labeled 1 go into the
    first list, everything else into the second.
    """
    ei = EventInterface()
    ei.setDB('citybeat')
    ei.setCollection('next_week_candidate_event_25by25_merged')
    true_events = []
    false_events = []
    # use a context manager so the file is closed on any exception
    with open('labeled_data_cf/label_next_week.txt', 'r') as label_file:
        for line in label_file:
            parts = line.split(',')
            event_id = str(parts[0])
            label = int(parts[1])
            event = ei.getDocument({'_id': ObjectId(event_id)})
            event['label'] = label
            e = Event(event)
            if e.getActualValue() < 8 or event['label'] == 0:
                continue
            if event['label'] == 1:
                true_events.append(event)
            else:
                false_events.append(event)
    return true_events, false_events
def generateData(biased=True): ei = EventInterface() ei.setDB("historic_alarm") ei.setCollection("labeled_event") events = ei.getAllDocuments() EventFeature.GenerateArffFileHeader() true_events = [] false_events = [] for event in events: event = EventFeature(event) feature_vector = event.extractFeatures(3) if feature_vector[-1] == 1: true_events.append(feature_vector) else: false_events.append(feature_vector) random.shuffle(false_events) for fv in true_events: for i in xrange(0, len(fv) - 1): print fv[i], ",", print fv[-1] j = 0 for fv in false_events: for i in xrange(0, len(fv) - 1): print fv[i], ",", print fv[-1] j += 1 if not biased and j == len(true_events): break
def buildCorpusOnDB(self, db, collection):
    """Feed every document of ``db.collection`` into this corpus."""
    interface = EventInterface()
    interface.setDB(db)
    interface.setCollection(collection)
    for doc in interface.getAllDocuments():
        self._addDocument(self.getWordList(doc))
def loadUnbalancedData(_182):
    """Load crowd-labeled events, applying manual label corrections.

    Args:
        _182: truthy -> corrections from 182_positive.txt,
              falsy -> corrections from 181_positive.txt.

    Returns:
        (true_events, false_events). Positives have label 1; negatives
        have label -1 with annotator confidence 1. Events with actual
        value < 8 or label 0 are dropped.
    """
    ei = EventInterface()
    ei.setDB('citybeat')
    ei.setCollection('candidate_event_25by25_merged')
    true_events = []
    false_events = []
    # manual corrections: event id -> corrected label
    if _182:
        corrections_path = 'labeled_data_cf/182_positive.txt'
    else:
        corrections_path = 'labeled_data_cf/181_positive.txt'
    modified_events = {}
    # 'with' closes the files even if a DB lookup raises mid-loop
    with open(corrections_path, 'r') as fid2:
        for line in fid2:
            t = line.split(',')
            modified_events[str(t[0])] = int(t[1])
    # crowd labels, one "<label> <confidence> <url-ending-in-id>" per line
    with open('labeled_data_cf/data2.txt', 'r') as fid:
        for line in fid:
            if len(line.strip()) == 0:
                continue
            t = line.strip().split()
            if not len(t) == 3:
                continue
            label = t[0].lower()
            confidence = float(t[1])
            event_id = str(t[2].split('/')[-1])
            if label == 'not_sure':
                continue
            label = 1 if label == 'yes' else -1
            event = ei.getDocument({'_id': ObjectId(event_id)})
            event['label'] = label
            # dict.has_key() is deprecated; use the 'in' operator
            if event_id in modified_events:
                event['label'] = modified_events[event_id]
            e = Event(event)
            if e.getActualValue() < 8 or event['label'] == 0:
                continue
            if event['label'] == 1:
                true_events.append(event)
            elif event['label'] == -1 and confidence == 1:
                false_events.append(event)
    return true_events, false_events
def loadUnbalancedData(_182):
    """Return (true_events, false_events) from the crowd-labeled data set.

    ``_182`` selects which manual-correction file overrides the crowd
    labels. Positives are events labeled 1; negatives are events labeled
    -1 whose annotator confidence equals 1. Small events (actual value
    < 8) and label-0 events are skipped.
    """
    ei = EventInterface()
    ei.setDB('citybeat')
    ei.setCollection('candidate_event_25by25_merged')
    true_events = []
    false_events = []
    correction_file = ('labeled_data_cf/182_positive.txt' if _182
                       else 'labeled_data_cf/181_positive.txt')
    modified_events = {}
    # context managers guarantee both files are closed on any error path
    with open(correction_file, 'r') as fid2:
        for line in fid2:
            t = line.split(',')
            modified_events[str(t[0])] = int(t[1])
    with open('labeled_data_cf/data2.txt', 'r') as fid:
        for line in fid:
            if len(line.strip()) == 0:
                continue
            t = line.strip().split()
            if not len(t) == 3:
                continue
            label = t[0].lower()
            confidence = float(t[1])
            event_id = str(t[2].split('/')[-1])
            if label == 'not_sure':
                continue
            label = 1 if label == 'yes' else -1
            event = ei.getDocument({'_id': ObjectId(event_id)})
            event['label'] = label
            # replace deprecated has_key() with the 'in' operator
            if event_id in modified_events:
                event['label'] = modified_events[event_id]
            e = Event(event)
            if e.getActualValue() < 8 or event['label'] == 0:
                continue
            if event['label'] == 1:
                true_events.append(event)
            elif event['label'] == -1 and confidence == 1:
                false_events.append(event)
    return true_events, false_events
def testWithPhoto(): corpus_all = buildAllCorpus(element_type='photos', debug=True) for key, corpus in corpus_all.items(): break ei = EventInterface() ei.setDB('citybeat') ei.setCollection('candidate_event_25by25_merged') event = ei.getDocument() event = BaseFeatureProduction(event, corpus=corpus) print event.extractFeatures()
def main(): ei = EventInterface() ei.setDB('citybeat') ei.setCollection('candidate_event_25by25_merged') events = ei.getAllDocuments() event = ei.getEventByID('511478c8c2a3754cfe6684a9') print event['region'] lat = (event['region']['min_lat'] + event['region']['max_lat']) / 2 lon = (event['region']['min_lng'] + event['region']['max_lng']) / 2 fid1 = open('region_cache/25_25.txt', 'r') for line in fid1: cor = line.split(' ') for i in xrange(len(cor)): cor[i] = float(cor[i]) if float(cor[0]) <= lat and lat <= float(cor[2]) and float( cor[1]) <= lon and lon <= float(cor[3]): min_lat = cor[0] max_lat = cor[2] min_lng = cor[1] max_lng = cor[3] print min_lat, max_lat, min_lng, max_lng break fid1.close() fid2 = open('labeled_data_cf/181_positive.txt', 'r') labels = {} for line in fid2: t = line.split(',') labels[str(t[0])] = int(t[1]) fid2.close() pos = 0 tot = 0 for event in events: region = event['region'] id = str(event['_id']) if id not in labels.keys(): continue if (floatEqual(region['min_lat'], min_lat) and floatEqual(region['max_lat'], max_lat) and floatEqual(region['min_lng'], min_lng) and floatEqual(region['max_lng'], max_lng)): tot += 1 if labels[id] == 1: pos += 1 print id print pos print tot
def testWithMerge():
    """Copy every 25x25 candidate event from citybeat into the test DB."""
    src = EventInterface()
    src.setDB('citybeat')
    src.setCollection('candidate_event_25by25')
    dst = EventInterface()
    dst.setDB('test')
    dst.setCollection('candidate_event')
    for event in src.getAllDocuments():
        dst.addEvent(event)
def mergeBaselineEvents():
    """Merge every baseline candidate event into the *_merged collection."""
    src = EventInterface()
    src.setDB('citybeat')
    src.setCollection('baseline_candidate_events')
    dst = EventInterface()
    dst.setDB('citybeat')
    dst.setCollection('baseline_candidate_events_merged')
    for event in src.getAllDocuments():
        dst.addEvent(event)
def main(): ei = EventInterface() ei.setDB('citybeat') ei.setCollection('candidate_event_25by25_merged') events = ei.getAllDocuments() event = ei.getEventByID('511478c8c2a3754cfe6684a9') print event['region'] lat = (event['region']['min_lat'] + event['region']['max_lat'])/2 lon = (event['region']['min_lng'] + event['region']['max_lng'])/2 fid1 = open('region_cache/25_25.txt', 'r') for line in fid1: cor = line.split(' ') for i in xrange(len(cor)): cor[i] = float(cor[i]) if float(cor[0]) <= lat and lat <= float(cor[2]) and float(cor[1]) <= lon and lon <= float(cor[3]): min_lat = cor[0] max_lat = cor[2] min_lng = cor[1] max_lng = cor[3] print min_lat, max_lat, min_lng, max_lng break fid1.close() fid2 = open('labeled_data_cf/181_positive.txt', 'r') labels = {} for line in fid2: t = line.split(',') labels[str(t[0])] = int(t[1]) fid2.close() pos = 0 tot = 0 for event in events: region = event['region'] id = str(event['_id']) if id not in labels.keys(): continue if (floatEqual(region['min_lat'], min_lat) and floatEqual(region['max_lat'], max_lat) and floatEqual(region['min_lng'], min_lng) and floatEqual(region['max_lng'], max_lng)): tot += 1 if labels[id] == 1: pos += 1 print id print pos print tot
def insertEvents():
    """Copy four hand-picked events into the online_candidate collection."""
    src = EventInterface()
    src.setDB('citybeat')
    src.setCollection('candidate_event_25by25_merged')
    dst = EventInterface()
    dst.setDB('citybeat')
    dst.setCollection('online_candidate')
    ids = ['51148288c2a3754cfe668edd',
           '51147952c2a3754cfe6684ee',
           '51148a7ec2a3754cfe669977',
           '51147967c2a3754cfe668503']
    for event_id in ids:
        dst.addEvent(src.getDocument({'_id': ObjectId(event_id)}))
def getAllActualEvents():
    """Return all crowd-labeled positive events (after manual corrections).

    Crowd labels come from labeled_data_cf/data2.txt; manual corrections
    from labeled_data_cf/181_positive.txt override them. Events with
    actual value < 8 or label 0 are skipped; only label-1 events are kept.
    """
    ei = EventInterface()
    ei.setDB("citybeat")
    ei.setCollection("candidate_event_25by25_merged")
    true_events = []
    modified_events = {}
    # 'with' guarantees the files are closed even if a lookup raises
    with open("labeled_data_cf/181_positive.txt", "r") as fid2:
        for line in fid2:
            t = line.split(",")
            modified_events[str(t[0])] = int(t[1])
    with open("labeled_data_cf/data2.txt", "r") as fid:
        for line in fid:
            if len(line.strip()) == 0:
                continue
            t = line.strip().split()
            if not len(t) == 3:
                continue
            label = t[0].lower()
            confidence = float(t[1])
            event_id = str(t[2].split("/")[-1])
            if label == "not_sure":
                continue
            label = 1 if label == "yes" else -1
            event = ei.getDocument({"_id": ObjectId(event_id)})
            event["label"] = label
            # dict.has_key() is deprecated; use 'in'
            if event_id in modified_events:
                event["label"] = modified_events[event_id]
            e = Event(event)
            if e.getActualValue() < 8 or event["label"] == 0:
                continue
            if event["label"] == 1:
                true_events.append(event)
    return true_events
def insertEvents():
    """Insert a fixed set of four events into citybeat.online_candidate."""
    source = EventInterface()
    source.setDB('citybeat')
    source.setCollection('candidate_event_25by25_merged')
    target = EventInterface()
    target.setDB('citybeat')
    target.setCollection('online_candidate')
    for event_id in ('51148288c2a3754cfe668edd',
                     '51147952c2a3754cfe6684ee',
                     '51148a7ec2a3754cfe669977',
                     '51147967c2a3754cfe668503'):
        event = source.getDocument({'_id': ObjectId(event_id)})
        target.addEvent(event)
def testWithTweet(): cnt = 0 corpus_all = buildAllCorpus(element_type='tweets', debug=False) ei = EventInterface() ei.setDB('citybeat_experiment') ei.setCollection('twitter_candidate_events') cur = ei.getAllDocuments() print TwitterFeature.GenerateArffFileHeader() for event in cur: region = Region(event['region']) event = TwitterFeature(event, corpus=corpus_all[region.getKey()]) if event.getActualValue() < 8: print '< 8' continue cnt += 1 print event.extractFeatures() print cnt, cur.count()
def testWithTweet(): from corpus import buildAllCorpus corpus_all = buildAllCorpus(element_type="tweets", debug=True) for key, corpus in corpus_all.items(): break ei = EventInterface() ei.setDB("citybeat") ei.setCollection("candidate_event_25by25_merged") event = ei.getDocument() print event ti = TweetInterface() cur = ti.getAllDocuments(limit=30) tweets = [] for tweet in cur: tweets.append(tweet) del event["photos"] event["tweets"] = tweets event = BaseFeature(event, corpus=corpus) print event.printFeatures()
def getBaselineEvents(): ei = EventInterface() ei.setDB('citybeat') ei.setCollection('baseline_candidate_events') events = ei.getAllDocuments() event_list = [] for event in events: e = Event(event) if e.getActualValue() < 8 or e.getZscore() < 3: continue event_list.append(event) # print len(event_list) # return random.shuffle(event_list) for i in xrange(50): print event_list[i]['_id']
def generateTrueLabelFile(): ei = EventInterface() ei.setDB('citybeat') ei.setCollection('candidate_event_25by25_merged') events = {} fid1 = open('labeled_data_cf/181_positive.txt', 'r') true_events = [] false_events = [] unknown_events = [] for line in fid1: t = line.split(',') id = str(t[0]) label = int(t[1]) events[id] = label fid1.close() for id, label in events.items(): event = ei.getDocument({'_id':ObjectId(id)}) event['label'] = label e = Event(event) if e.getActualValue() < 8: # print 'bad event ' + id continue if event['label'] == -1: false_events.append(event) else: if event['label'] == 1: true_events.append(event) else: unknown_events.append(event) for event in true_events + false_events + unknown_events: print str(event['_id'])+','+str(event['label'])
def generateTrueLabelFile(): ei = EventInterface() ei.setDB('citybeat') ei.setCollection('candidate_event_25by25_merged') events = {} fid1 = open('labeled_data_cf/181_positive.txt', 'r') true_events = [] false_events = [] unknown_events = [] for line in fid1: t = line.split(',') id = str(t[0]) label = int(t[1]) events[id] = label fid1.close() for id, label in events.items(): event = ei.getDocument({'_id': ObjectId(id)}) event['label'] = label e = Event(event) if e.getActualValue() < 8: # print 'bad event ' + id continue if event['label'] == -1: false_events.append(event) else: if event['label'] == 1: true_events.append(event) else: unknown_events.append(event) for event in true_events + false_events + unknown_events: print str(event['_id']) + ',' + str(event['label'])
from region import Region
from bson.objectid import ObjectId
from event import Event
from datetime import datetime
import random

# Sample n random events from the 10x10 merged collection into the
# label.label_10by10 collection for manual labeling.
n = 300
ei = EventInterface()
ei.setCollection("candidate_event_10by10_merged")
event_list = []
for event in ei.getAllDocuments():
    event_list.append(event)
random.shuffle(event_list)
ei2 = EventInterface()
ei2.setDB("label")
ei2.setCollection("label_10by10")
i = 0
for event in event_list:
    ei2.saveDocument(event)
    i += 1
    # use n here — the original hard-coded 300, silently ignoring n
    if i == n:
        break
class Representor():
    """Pick representative photos for an event via tf-idf over captions.

    A TfidfVectorizer is fitted on the captions of every event in the
    given collection; an event's photos are ranked by the cosine
    similarity of their caption to the event's caption centroid. A custom
    vectorizer may be supplied to override the tf-idf parameters — see
    http://scikit-learn.org/dev/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
    """

    def __init__(self, vectorizer=None, db='AmazonMT',
                 collection='candidate_event_25by25_merged'):
        self.ei = EventInterface()
        self.ei.setDB(db)
        self.ei.setCollection(collection)
        self.events = []
        for raw in self.ei.getAllDocuments():
            ev = Event(raw)
            # keep at most one photo per user before collecting captions
            ev.selectOnePhotoForOneUser()
            self.events.append(ev.toJSON())
        self._captions = self._getAllCaptions()
        if vectorizer is None:
            self.vectorizer = TfidfVectorizer(
                max_df=0.05,
                min_df=1,
                strip_accents='ascii',
                smooth_idf=True,
                preprocessor=self._preProcessor,
                sublinear_tf=True,
                norm='l2',
                analyzer='char_wb',
                ngram_range=(4, 4),
                stop_words='english')
        else:
            self.vectorizer = vectorizer
        self.vectorizer.fit_transform(self._captions)

    def _preProcessor(self, caption):
        # drop captions carrying five or more hashtags (likely tag spam)
        hashtags = re.compile(r"#\w+").findall(caption)
        if len(hashtags) >= 5:
            return ""
        return caption

    def _getAllCaptions(self):
        captions = []
        for event in self.events:
            captions += self._getEventCaptions(event)
        return captions

    def _is_ascii(self, _str):
        return all(ord(c) < 128 for c in _str)

    def _getEventCaptions(self, event):
        """Return one caption per photo; "" when missing or non-ascii."""
        captions = []
        for photo in event['photos']:
            try:
                text = photo['caption']['text']
                if self._is_ascii(text):
                    captions.append(text.lower())
                else:
                    captions.append("")
            except:
                captions.append("")
        return captions

    def _cosine_sim(self, a, b):
        return a * b.T

    def getRepresentivePhotos(self, event):
        """Return the event's photos ordered most-representative first."""
        captions = self._getEventCaptions(event)
        tfidf = self.vectorizer.transform(captions)
        centroid = tfidf.mean(axis=0)
        sims = np.asarray(self._cosine_sim(centroid, tfidf)).flatten()
        ranked = [event['photos'][idx] for idx in sims.argsort()]
        ranked.reverse()
        return ranked

    def getTfidfVector(self, event):
        """Return (feature indices, words, mean tf-idf values) for an event."""
        voc = self.vectorizer.get_feature_names()
        tf_vec = self.vectorizer.transform(
            self._getEventCaptions(event)).mean(axis=0)
        res_list = np.nonzero(tf_vec)[1].ravel().tolist()[0]
        words = []
        values = []
        for idx in res_list:
            words.append(voc[idx])
            values.append(tf_vec[0, idx])
        return res_list, words, values

    def getCorpusWordsVector(self):
        return self.vectorizer.get_feature_names()
from event_interface import EventInterface from bson.objectid import ObjectId from event_feature import EventFeature ei = EventInterface() ei.setDB('historic_alarm') ei.setCollection('raw_event') ei2 = EventInterface() ei2.setDB('historic_alarm') ei2.setCollection('labeled_event') #fid = open('final_labels.txt', 'r') # #for line in fid: # vals = line.split() # label = -1 # if len(vals) > 1 and vals[1] == '1': # label = 1 # event = ei.getDocument({'_id':ObjectId(vals[0])}) # event['label'] = label # ei2.updateDocument(event) events = ei2.getAllDocuments() for event in events: label = event['label'] event = EventFeature(event) (lat, lng) = event._getPhotoAvgLocation() print lat, lng, label
from event_interface import EventInterface
from bson.objectid import ObjectId
from event_feature import EventFeature

# Walk every labeled event and compute its average photo location.
ei = EventInterface()
ei.setDB('historic_alarm')
ei.setCollection('raw_event')
ei2 = EventInterface()
ei2.setDB('historic_alarm')
ei2.setCollection('labeled_event')

# Historical label-import step, retained as a comment for reference:
#fid = open('final_labels.txt', 'r')
#
#for line in fid:
#    vals = line.split()
#    label = -1
#    if len(vals) > 1 and vals[1] == '1':
#        label = 1
#    event = ei.getDocument({'_id':ObjectId(vals[0])})
#    event['label'] = label
#    ei2.updateDocument(event)

for doc in ei2.getAllDocuments():
    label = doc['label']
    feature = EventFeature(doc)
    (lat, lng) = feature._getPhotoAvgLocation()
from region import Region
from bson.objectid import ObjectId
from event import Event
from datetime import datetime
import random

# Copy n randomly chosen 10x10 merged events into label.label_10by10
# for manual annotation.
n = 300
ei = EventInterface()
ei.setCollection('candidate_event_10by10_merged')
event_list = []
for event in ei.getAllDocuments():
    event_list.append(event)
random.shuffle(event_list)
ei2 = EventInterface()
ei2.setDB('label')
ei2.setCollection('label_10by10')
i = 0
for event in event_list:
    ei2.saveDocument(event)
    i += 1
    # the original compared against a hard-coded 300 instead of n
    if i == n:
        break
from datetime import datetime
from event_interface import EventInterface


def getDate(utc_time):
    """Return repr() of the local datetime for a UTC timestamp string."""
    return repr(datetime.fromtimestamp(int(utc_time)))


# Promote next-week candidate events that pass the quality thresholds
# (actual_value >= 8 and zscore >= 3.0) into the merged collection,
# oldest first.
src = EventInterface()
src.setDB('citybeat')
src.setCollection('next_week_candidate_event_25by25')
dst = EventInterface()
dst.setDB('citybeat')
dst.setCollection('next_week_candidate_event_25by25_merged')
for event in src.getAllDocuments().sort('created_time', 1):
    if event['actual_value'] >= 8 and event['zscore'] >= 3.0:
        dst.addEvent(event)

# Scratch region/time query notes kept from the original:
#region= {'min_lat': 40.743583800000003, 'max_lng': -73.978088200000002, 'min_lng': -73.998103900000004, 'max_lat': 40.756847}
#utc_time = str(1354728300)
#region = {'min_lat': 40.730320599999999, 'max_lng': -73.978088200000002, 'min_lng': -73.998103900000004, 'max_lat': 40.743583800000003}
#utc_time = str(1354340400)
#
#condition = ({'region.min_lat':region['min_lat'],
from event_interface import EventInterface

# Mirror every merged candidate event from citybeat into AmazonMT.
src = EventInterface()
src.setDB('citybeat')
src.setCollection('candidate_event_25by25_merged')
dst = EventInterface()
dst.setDB('AmazonMT')
dst.setCollection('candidate_event_25by25_merged')
for event in src.getAllDocuments():
    dst.saveDocument(event)
from event_interface import EventInterface from event_feature import EventFeature from photo_interface import PhotoInterface from photo import Photo from region import Region from event import Event from caption_parser import CaptionParser from stopwords import Stopwords import operator import string import types import random import math ei = EventInterface() ei.setDB('AmazonMT') ei.setCollection('candidate_event_25by25_merged') events = ei.getAllDocuments() duplicates = 0 for event in events: e = Event(event) flag = e.removeDuplicatePhotos() if flag > 0: print e.getPhotoNumber(), e.getActualValue() ei.updateDocument(e)
class Representor:
    """Rank an event's photos by how representative their captions are.

    Fits a tf-idf vectorizer over all captions in the configured
    collection; a photo is representative when its caption is close
    (cosine similarity) to the event's mean caption vector. Pass a
    custom vectorizer to change the tf-idf parameters — see
    http://scikit-learn.org/dev/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
    """

    def __init__(self, vectorizer=None, db="AmazonMT",
                 collection="candidate_event_25by25_merged"):
        self.ei = EventInterface()
        self.ei.setDB(db)
        self.ei.setCollection(collection)
        self.events = []
        for raw_doc in self.ei.getAllDocuments():
            wrapped = Event(raw_doc)
            # one photo per user so prolific posters don't dominate
            wrapped.selectOnePhotoForOneUser()
            self.events.append(wrapped.toJSON())
        self._captions = self._getAllCaptions()
        if vectorizer is not None:
            self.vectorizer = vectorizer
        else:
            self.vectorizer = TfidfVectorizer(
                max_df=0.05,
                min_df=1,
                strip_accents="ascii",
                smooth_idf=True,
                preprocessor=self._preProcessor,
                sublinear_tf=True,
                norm="l2",
                analyzer="char_wb",
                ngram_range=(4, 4),
                stop_words="english",
            )
        self.vectorizer.fit_transform(self._captions)

    def _preProcessor(self, caption):
        # captions with 5+ hashtags are treated as spam and blanked out
        if len(re.compile(r"#\w+").findall(caption)) >= 5:
            return ""
        return caption

    def _getAllCaptions(self):
        collected = []
        for event in self.events:
            collected += self._getEventCaptions(event)
        return collected

    def _is_ascii(self, _str):
        return all(ord(c) < 128 for c in _str)

    def _getEventCaptions(self, event):
        """One caption per photo, "" standing in for missing/non-ascii text."""
        result = []
        for photo in event["photos"]:
            try:
                text = photo["caption"]["text"]
                if self._is_ascii(text):
                    result.append(text.lower())
                else:
                    result.append("")
            except:
                result.append("")
        return result

    def _cosine_sim(self, a, b):
        return a * b.T

    def getRepresentivePhotos(self, event):
        """Return the event's photos sorted most-representative first."""
        caption_tfidf = self.vectorizer.transform(
            self._getEventCaptions(event))
        centroid = caption_tfidf.mean(axis=0)
        similarities = np.asarray(
            self._cosine_sim(centroid, caption_tfidf)).flatten()
        ordered = [event["photos"][i] for i in similarities.argsort()]
        ordered.reverse()
        return ordered

    def getTfidfVector(self, event):
        """Return (feature indices, words, mean tf-idf values) for an event."""
        vocabulary = self.vectorizer.get_feature_names()
        mean_vec = self.vectorizer.transform(
            self._getEventCaptions(event)).mean(axis=0)
        indices = np.nonzero(mean_vec)[1].ravel().tolist()[0]
        words = [vocabulary[i] for i in indices]
        values = [mean_vec[0, i] for i in indices]
        return indices, words, values

    def getCorpusWordsVector(self):
        return self.vectorizer.get_feature_names()
res.append([word, fre, photos[0:k]]) return res def getTopKeywordsAndPhotos(self, num_keywords, num_photos): keywords = self._getTopKeywordsWithoutStopwords(num_keywords) return self._getRandomPhotosAssociatedWithKeywords( keywords, num_photos) def getTopKeywordsAndPhotosByTFIDF(self, num_keywords, num_photos): keywords = self._getTopKeywordsWithoutStopwords(100000) keywords = self._corpus.chooseTopWordWithHighestTDIDF( keywords, num_keywords) return self._getRandomPhotosAssociatedWithKeywords( keywords, num_photos) if __name__ == '__main__': collection = 'candidate_event_10by10_merged' c = Corpus() c.buildCorpusOnDB('citybeat', collection) ei = EventInterface() ei.setDB('citybeat') ei.setCollection(collection) events = ei.getAllDocuments() for event in events: event = EventFrontend(event, c) print event.getTopKeywordsAndPhotosByTFIDF(10, 0)
from prediction_interface import PredictionInterface
from region import Region
from bson.objectid import ObjectId
from event import Event
from datetime import datetime


def getDate(utc_time):
    """Return repr() of the local datetime for a UTC timestamp string."""
    return repr(datetime.fromtimestamp(int(utc_time)))


# Copy next-week candidates passing the thresholds (actual_value >= 8,
# zscore >= 3.0) into the merged collection, in chronological order.
src = EventInterface()
src.setDB("citybeat")
src.setCollection("next_week_candidate_event_25by25")
dst = EventInterface()
dst.setDB("citybeat")
dst.setCollection("next_week_candidate_event_25by25_merged")
for event in src.getAllDocuments().sort("created_time", 1):
    if event["actual_value"] >= 8 and event["zscore"] >= 3.0:
        dst.addEvent(event)

# Scratch region/time notes kept from the original:
# region= {'min_lat': 40.743583800000003, 'max_lng': -73.978088200000002, 'min_lng': -73.998103900000004, 'max_lat': 40.756847}
# utc_time = str(1354728300)