Exemple #1
0
    def removeDuplicatePhotos(self):
        """Drop photos that repeat an earlier photo's user name and caption.

        A photo is a duplicate when a previously kept photo has the same
        user name and the same non-empty caption.  Photos whose caption is
        empty (or None) are never treated as duplicates.  This is a
        temporary heuristic: only the user name and caption are compared.

        ``self._event['photos']`` is replaced with the filtered list only
        when at least one duplicate was found.

        Returns:
            The number of duplicate photos that were dropped.
        """
        # Remember the (user, caption) pairs of kept photos in a set so each
        # photo costs one lookup instead of a rescan of every kept photo.
        # NOTE(review): assumes user names and captions are hashable
        # (strings) -- confirm against Photo.
        seen_pairs = set()
        kept_photos = []
        num_duplicate = 0
        for photo in self._event['photos']:
            wrapped = Photo(photo)
            caption = wrapped.getCaption()
            user = wrapped.getUserName()
            if caption:
                pair = (user, caption)
                if pair in seen_pairs:
                    num_duplicate += 1
                    continue
                seen_pairs.add(pair)
            kept_photos.append(photo)

        if num_duplicate > 0:
            self._event['photos'] = kept_photos

        return num_duplicate
Exemple #2
0
	def removeDuplicatePhotos(self):
		"""Drop photos that repeat an earlier photo's user name and caption.

		A photo is a duplicate when a previously kept photo has the same
		user name and the same non-empty caption.  Photos whose caption is
		empty (or None) are never treated as duplicates.  This is a
		temporary heuristic: only the user name and caption are compared.

		``self._event['photos']`` is replaced with the filtered list only
		when at least one duplicate was found.

		Returns:
			The number of duplicate photos that were dropped.
		"""
		# Remember the (user, caption) pairs of kept photos in a set so each
		# photo costs one lookup instead of a rescan of every kept photo.
		# NOTE(review): assumes user names and captions are hashable
		# (strings) -- confirm against Photo.
		seen_pairs = set()
		kept_photos = []
		num_duplicate = 0
		for photo in self._event['photos']:
			wrapped = Photo(photo)
			caption = wrapped.getCaption()
			user = wrapped.getUserName()
			if caption:
				pair = (user, caption)
				if pair in seen_pairs:
					num_duplicate += 1
					continue
				seen_pairs.add(pair)
			kept_photos.append(photo)

		if num_duplicate > 0:
			self._event['photos'] = kept_photos

		return num_duplicate
Exemple #3
0
	def getWordList(self, event):
		"""Collect the caption words of every photo in ``event``.

		Each photo's caption is inserted into a ``CaptionParser(True)`` and
		the parser's ``getTopWords(-1, False)`` result is returned.

		Args:
			event: mapping with a 'photos' entry holding the raw photos.

		Returns:
			The parser's word list; per the original note this is a list of
			(word, freq) pairs.
		"""
		parser = CaptionParser(True)
		# Lazy generator: each photo is wrapped right before its caption is
		# inserted, one photo at a time.
		captions = (Photo(raw_photo).getCaption() for raw_photo in event['photos'])
		for caption in captions:
			parser.insertCaption(caption)
		return parser.getTopWords(-1, False)
Exemple #4
0
 def _getTopWords(self, k, stopword_removal=False):
     """Return the parser's top ``k`` words over this event's captions.

     Args:
         k: forwarded unchanged to ``CaptionParser.getTopWords``.
         stopword_removal: forwarded to the ``CaptionParser`` constructor.

     Returns:
         Whatever ``CaptionParser.getTopWords(k)`` returns.
     """
     parser = CaptionParser(stopword_removal=stopword_removal)
     for raw_photo in self._event['photos']:
         text = Photo(raw_photo).getCaption()
         if text is None:
             # Photos without a caption contribute nothing.
             continue
         parser.insertCaption(text)
     return parser.getTopWords(k)
Exemple #5
0
 def _getTopWords(self, k, stopword_removal=False):
     """Return the parser's top ``k`` words over this event's captions.

     Args:
         k: forwarded unchanged to ``CaptionParser.getTopWords``.
         stopword_removal: forwarded to the ``CaptionParser`` constructor.

     Returns:
         Whatever ``CaptionParser.getTopWords(k)`` returns.
     """
     parser = CaptionParser(stopword_removal=stopword_removal)
     for raw_photo in self._event["photos"]:
         text = Photo(raw_photo).getCaption()
         if text is None:
             # Photos without a caption contribute nothing.
             continue
         parser.insertCaption(text)
     return parser.getTopWords(k)
Exemple #6
0
 def getCaptionPercentage(self):
     """Return the fraction of this event's photos that have a caption.

     Despite the name, the result is a ratio, not a value scaled to 100.

     Returns:
         A float: photos with a non-empty caption divided by the total
         number of photos, or 0.0 when the event has no photos.
     """
     photos = self._event['photos']
     total = len(photos)
     if total == 0:
         # An empty event used to raise ZeroDivisionError here.
         return 0.0
     cap_number = 0
     for photo in photos:
         if len(Photo(photo).getCaption()) > 0:
             cap_number += 1
     # Multiply by 1.0 so Python 2 performs float division.
     return cap_number * 1.0 / total
Exemple #7
0
 def getCaptionPercentage(self):
     """Return the fraction of this event's photos that have a caption.

     Despite the name, the result is a ratio, not a value scaled to 100.

     Returns:
         A float: photos with a non-empty caption divided by the total
         number of photos, or 0.0 when the event has no photos.
     """
     photos = self._event["photos"]
     total = len(photos)
     if total == 0:
         # An empty event used to raise ZeroDivisionError here.
         return 0.0
     cap_number = 0
     for photo in photos:
         if len(Photo(photo).getCaption()) > 0:
             cap_number += 1
     # Multiply by 1.0 so Python 2 performs float division.
     return cap_number * 1.0 / total
 def _getTopWords(self, k, stopword_removal=False):
     """Return the parser's top ``k`` words over this event's captions.

     Per the original note, the words are ranked by frequency.

     Args:
         k: forwarded unchanged to ``TextParser.getTopWords``.
         stopword_removal: forwarded to the ``TextParser`` constructor.

     Returns:
         Whatever ``TextParser.getTopWords(k)`` returns.
     """
     parser = TextParser(stopword_removal=stopword_removal)
     for raw_photo in self._event['photos']:
         text = Photo(raw_photo).getCaption()
         if text is None:
             # Photos without a caption contribute nothing.
             continue
         parser.insertCaption(text)
     return parser.getTopWords(k)
Exemple #9
0
	def computeWordKLDivergenceWithByEddie(self, event):
		"""Compute the KL divergence between this event's captions and another's.

		The captions of each event are concatenated (each one prefixed with a
		space), tokenized with ``tokenize`` and handed to ``kldiv``.

		Args:
			event: either a dict (or dict subclass) with a 'photos' entry, or
				an object whose ``toJSON()`` returns such a dict.

		Returns:
			Whatever ``kldiv`` returns for the two tokenized texts.
		"""
		# join() builds each text in one pass instead of repeated +=.
		text1 = ''.join(
			' ' + Photo(photo).getCaption() for photo in self._event['photos'])

		# isinstance() also accepts dict subclasses, which the former exact
		# type comparison would have wrongly sent through toJSON().
		if not isinstance(event, dict):
			event = event.toJSON()

		text2 = ''.join(
			' ' + Photo(photo).getCaption() for photo in event['photos'])
		return kldiv(tokenize(text1), tokenize(text2))
    def computeWordKLDivergenceWithByEddie(self, event):
        """Compute the KL divergence between this event's captions and another's.

        The captions of each event are concatenated (each one prefixed with a
        space), tokenized with ``tokenize`` and handed to ``kldiv``.

        Args:
            event: either a dict (or dict subclass) with a 'photos' entry, or
                an object whose ``toDict()`` returns such a dict.

        Returns:
            Whatever ``kldiv`` returns for the two tokenized texts.
        """
        # join() builds each text in one pass instead of repeated +=.
        text1 = ''.join(
            ' ' + Photo(photo).getCaption() for photo in self._event['photos'])

        # isinstance() also accepts dict subclasses, which the former exact
        # type comparison would have wrongly sent through toDict().
        if not isinstance(event, dict):
            event = event.toDict()

        text2 = ''.join(
            ' ' + Photo(photo).getCaption() for photo in event['photos'])
        return kldiv(tokenize(text1), tokenize(text2))
        def PhotoDistanceByCaption(photo1, photo2):
            """Return the KL divergence between the caption words of two photos.

            Each caption goes into its own ``TextParser(True)`` (the flag
            presumably enables stop-word removal -- confirm against
            TextParser).  The (word, freq) lists from ``getTopWords(-1)`` are
            converted to dicts and passed to ``kldiv``.

            Returns:
                The ``kldiv`` result, or None when either caption yields no
                words and the photos cannot be compared.
            """
            first, second = Photo(photo1), Photo(photo2)
            captions = (first.getCaption(), second.getCaption())

            parsers = []
            for caption in captions:
                parser = TextParser(True)
                parser.insertCaption(caption)
                parsers.append(parser)

            words_a, words_b = [parser.getTopWords(-1) for parser in parsers]
            if 0 in (len(words_a), len(words_b)):
                # Nothing to compare on at least one side.
                return None

            freq_a = {word: freq for word, freq in words_a}
            freq_b = {word: freq for word, freq in words_b}
            return kldiv(freq_a, freq_b)
Exemple #12
0
		def PhotoDistanceByCaption(photo1, photo2):
			"""Return the KL divergence between the caption words of two photos.

			Each caption goes into its own ``CaptionParser(True)`` (the flag
			presumably enables stop-word removal -- confirm against
			CaptionParser).  The (word, freq) lists from ``getTopWords(-1)``
			are converted to dicts and passed to ``kldiv``.

			Returns:
				The ``kldiv`` result, or None when either caption yields no
				words and the photos cannot be compared.
			"""
			first, second = Photo(photo1), Photo(photo2)
			captions = (first.getCaption(), second.getCaption())

			parsers = []
			for caption in captions:
				parser = CaptionParser(True)
				parser.insertCaption(caption)
				parsers.append(parser)

			words_a, words_b = [parser.getTopWords(-1) for parser in parsers]
			if 0 in (len(words_a), len(words_b)):
				# Nothing to compare on at least one side.
				return None

			freq_a = {word: freq for word, freq in words_a}
			freq_b = {word: freq for word, freq in words_b}
			return kldiv(freq_a, freq_b)
Exemple #13
0
 def getAvgCaptionLen(self):
     """Return the mean caption length over photos that have a caption.

     Photos with an empty caption are left out of both the sum and the
     count.

     Returns:
         The average length as a float, or -1 when no photo has a
         non-empty caption.
     """
     lengths = [len(Photo(raw_photo).getCaption())
                for raw_photo in self._event['photos']]
     nonempty = [length for length in lengths if length > 0]
     if not nonempty:
         return -1
     return 1.0 * sum(nonempty) / len(nonempty)
Exemple #14
0
 def getAvgCaptionLen(self):
     """Return the mean caption length over photos that have a caption.

     Photos with an empty caption are left out of both the sum and the
     count.

     Returns:
         The average length as a float, or -1 when no photo has a
         non-empty caption.
     """
     lengths = [len(Photo(raw_photo).getCaption())
                for raw_photo in self._event["photos"]]
     nonempty = [length for length in lengths if length > 0]
     if not nonempty:
         return -1
     return 1.0 * sum(nonempty) / len(nonempty)
 def countHashtagsFromPhotosContainingTopKeywords(self, k=3):
     """Report hashtag usage for the photos behind each top keyword.

     Calls ``self.getTopKeywordsAndPhotos(k, 10000)`` and reads the photo
     collection stored at index 2 of every returned entry.

     Args:
         k: number of top keywords to request; also the length of both
             result lists.

     Returns:
         ``[avg_hashtags, photo_counts]``.  ``avg_hashtags[i]`` is the
         number of '#' characters in the captions of entry i's photos
         divided by the number of those photos; ``photo_counts[i]`` is the
         number of those photos.  Slots without a returned entry stay 0.
     """
     keyword_entries = self.getTopKeywordsAndPhotos(k, 10000)
     avg_hashtags = [0] * k
     photo_counts = [0] * k
     for idx, entry in enumerate(keyword_entries):
         entry_photos = entry[2]
         hashtag_total = 0
         visited = 0
         for raw_photo in entry_photos:
             hashtag_total += Photo(raw_photo).getCaption().count('#')
             visited += 1
         # Average number of '#' per photo for this keyword.
         avg_hashtags[idx] = hashtag_total * 1.0 / visited
         # Number of photos associated with this keyword.
         photo_counts[idx] = len(entry_photos)
     return [avg_hashtags, photo_counts]
Exemple #16
0
	def countHashtagsFromPhotosContainingTopKeywords(self, k=3):
		"""Report hashtag usage for the photos behind each top keyword.

		Calls ``self.getTopKeywordsAndPhotos(k, 10000)`` and reads the photo
		collection stored at index 2 of every returned entry.

		Args:
			k: number of top keywords to request; also the length of both
				result lists.

		Returns:
			``[avg_hashtags, photo_counts]``.  ``avg_hashtags[i]`` is the
			number of '#' characters in the captions of entry i's photos
			divided by the number of those photos; ``photo_counts[i]`` is the
			number of those photos.  Slots without a returned entry stay 0.
		"""
		keyword_entries = self.getTopKeywordsAndPhotos(k, 10000)
		avg_hashtags = [0] * k
		photo_counts = [0] * k
		for idx, entry in enumerate(keyword_entries):
			entry_photos = entry[2]
			hashtag_total = 0
			visited = 0
			for raw_photo in entry_photos:
				hashtag_total += Photo(raw_photo).getCaption().count('#')
				visited += 1
			# Average number of '#' per photo for this keyword.
			avg_hashtags[idx] = hashtag_total * 1.0 / visited
			# Number of photos associated with this keyword.
			photo_counts[idx] = len(entry_photos)
		return [avg_hashtags, photo_counts]
Exemple #17
0
from photo_interface import PhotoInterface
from caption_parser import CaptionParser
from photo import Photo
from mongodb_interface import MongoDBInterface

import random

if __name__ == '__main__':
    # Copy a random sample of non-empty photo captions from the
    # 'citybeat.photos' collection into 'test_caption.captions'.
    photo_source = PhotoInterface()
    photo_source.setDB('citybeat')
    photo_source.setCollection('photos')

    caption_sink = MongoDBInterface()
    caption_sink.setDB('test_caption')
    caption_sink.setCollection('captions')

    for raw_photo in photo_source.getAllDocuments():
        # randint(0, 10) is inclusive on both ends; only a draw of 0 keeps
        # the photo, i.e. roughly one photo in eleven.
        if random.randint(0, 10) != 0:
            continue
        caption = Photo(raw_photo).getCaption()
        if len(caption) > 0:
            caption_sink.saveDocument({'caption': caption})
from photo_interface import PhotoInterface
from caption_parser import CaptionParser
from photo import Photo
from mongodb_interface import MongoDBInterface

import random


if __name__ == '__main__':
	# Copy a random sample of non-empty photo captions from the
	# 'citybeat.photos' collection into 'test_caption.captions'.
	photo_source = PhotoInterface()
	photo_source.setDB('citybeat')
	photo_source.setCollection('photos')

	caption_sink = MongoDBInterface()
	caption_sink.setDB('test_caption')
	caption_sink.setCollection('captions')

	for raw_photo in photo_source.getAllDocuments():
		# randint(0, 10) is inclusive on both ends; only a draw of 0 keeps
		# the photo, i.e. roughly one photo in eleven.
		if random.randint(0, 10) != 0:
			continue
		caption = Photo(raw_photo).getCaption()
		if len(caption) > 0:
			caption_sink.saveDocument({'caption': caption})