def getWordList(self, event):
    """Build the word list for all photo captions of an event.

    Args:
        event: mapping whose 'photos' entry is iterable; every item is
            wrapped in ``Photo`` to read its caption.

    Returns:
        Whatever ``CaptionParser.getTopWords(-1, False)`` returns; the
        original comment describes it as a list of (word, freq).
    """
    parser = CaptionParser(True)
    for raw_photo in event['photos']:
        # NOTE(review): the caption is inserted without a None check --
        # confirm insertCaption accepts None.
        parser.insertCaption(Photo(raw_photo).getCaption())
    return parser.getTopWords(-1, False)
def _getTopWords(self, k, stopword_removal=False):
    """Feed this event's photo captions to a parser and ask for top words.

    Args:
        k: forwarded unchanged to ``CaptionParser.getTopWords``.
        stopword_removal: forwarded to the ``CaptionParser`` constructor.

    Returns:
        The result of ``getTopWords(k)`` on the filled parser.
    """
    parser = CaptionParser(stopword_removal=stopword_removal)
    # Lazy generator: each Photo is built and read right before its caption
    # is inserted, exactly one photo at a time.
    captions = (Photo(raw).getCaption() for raw in self._event["photos"])
    for text in captions:
        if text is None:
            # photos without a caption contribute nothing
            continue
        parser.insertCaption(text)
    return parser.getTopWords(k)
def _getTopWords(self, k, stopword_removal=False):
    """Return ``getTopWords(k)`` over the captions of ``self._event``.

    Args:
        k: passed straight through to ``CaptionParser.getTopWords``.
        stopword_removal: passed to ``CaptionParser`` as a keyword argument.

    Returns:
        The value produced by the parser's ``getTopWords(k)``.
    """
    word_counter = CaptionParser(stopword_removal=stopword_removal)
    for entry in self._event['photos']:
        text = Photo(entry).getCaption()
        if text is None:
            # skip photos that have no caption
            continue
        word_counter.insertCaption(text)
    return word_counter.getTopWords(k)
def _getTopWords(self, k, stopword_removal=False):
    """Get top words by counting frequency over this event's captions.

    Args:
        k: handed to ``CaptionParser.getTopWords`` as-is.
        stopword_removal: handed to the ``CaptionParser`` constructor.

    Returns:
        The parser's ``getTopWords(k)`` result.
    """
    parser = CaptionParser(stopword_removal=stopword_removal)
    all_captions = (
        Photo(item).getCaption() for item in self._event['photos']
    )
    # Only captions that are present (not None) are counted.
    present = (text for text in all_captions if text is not None)
    for text in present:
        parser.insertCaption(text)
    return parser.getTopWords(k)
def PhotoDistanceByCaption(photo1, photo2):
    """Compare two photos through the word frequencies of their captions.

    Each caption goes into its own ``CaptionParser(True)``; the word lists
    from ``getTopWords(-1)`` are turned into word -> freq dicts and handed
    to ``kldiv``.

    Args:
        photo1: raw photo, wrapped in ``Photo`` to read its caption.
        photo2: raw photo, wrapped in ``Photo`` to read its caption.

    Returns:
        ``kldiv(dict1, dict2)``, or None when either word list is empty
        (the photos cannot be compared).
    """
    first, second = Photo(photo1), Photo(photo2)
    captions = (first.getCaption(), second.getCaption())

    # One independent parser per caption.
    parsers = []
    for caption in captions:
        parser = CaptionParser(True)
        parser.insertCaption(caption)
        parsers.append(parser)

    words_a, words_b = [parser.getTopWords(-1) for parser in parsers]

    if not (len(words_a) and len(words_b)):
        # unable to compare
        return None

    # Each word list is a sequence of (word, freq) pairs.
    return kldiv(dict(words_a), dict(words_b))
from photo_interface import PhotoInterface from caption_parser import CaptionParser from mongodb_interface import MongoDBInterface from photo import Photo import random if __name__ == '__main__': mi = MongoDBInterface() mi.setDB('test_caption') mi.setCollection('captions') cp = CaptionParser(True) i = 0 captions = mi.getAllDocuments() for caption in captions: i += 1 if i % 1000 == 0: # print cp.getTopWords(200) print i print len(cp._) cp.insertCaption(caption['caption']) for word, value in cp.getTopWords(300): print '\''+word+'\',', print
from photo_interface import PhotoInterface from caption_parser import CaptionParser from mongodb_interface import MongoDBInterface from photo import Photo import random if __name__ == '__main__': mi = MongoDBInterface() mi.setDB('test_caption') mi.setCollection('captions') cp = CaptionParser(True) i = 0 captions = mi.getAllDocuments() for caption in captions: i += 1 if i % 1000 == 0: # print cp.getTopWords(200) print i print len(cp._) cp.insertCaption(caption['caption']) for word, value in cp.getTopWords(300): print '\'' + word + '\',', print