def generateData():
    """Run the Twitter feature pipeline over the next-week candidate events.

    Builds a Representor and a Corpus on the 'citybeat' database /
    'next_week_candidate_event_25by25_merged' collection, loads the two
    event lists returned by loadNextWeekData(), calls
    EventFeatureTwitter(None).GenerateArffFileHeader() once, and then calls
    printFeatures() for every event (first list followed by second list).

    Returns None; all output is produced by the EventFeatureTwitter calls.
    """
    db_name = 'citybeat'
    collection_name = 'next_week_candidate_event_25by25_merged'

    rep = Representor(None, db_name, collection_name)
    corpus = Corpus()
    corpus.buildCorpusOnDB(db_name, collection_name)

    true_events, false_events = loadNextWeekData()

    # The header call uses an instance built without an event.
    EventFeatureTwitter(None).GenerateArffFileHeader()

    for candidate in true_events + false_events:
        features = EventFeatureTwitter(candidate, corpus, rep)
        features.printFeatures()
def generateData2(_182, sparse=False):
    """Run the feature pipeline over the unbalanced candidate-event data.

    Args:
        _182: Forwarded unchanged to loadUnbalancedData(); its meaning is
            defined by that function (not visible from here).
        sparse: When true, use EventFeatureSparse with the word list/index
            from getCorpusWordList(); otherwise use EventFeatureTwitter.

    Returns None; all output is produced by the feature classes' header and
    printFeatures() calls.
    """
    # The representor is built unconditionally: both feature classes take it.
    rep = Representor()
    corpus = Corpus()
    corpus.buildCorpusOnDB('citybeat', 'candidate_event_25by25_merged')

    true_events, false_events = loadUnbalancedData(_182)

    if sparse:
        word_index, word_list = getCorpusWordList(
            rep, true_events + false_events)
        EventFeatureSparse(None).GenerateArffFileHeader(word_list)
        for candidate in true_events + false_events:
            sparse_features = EventFeatureSparse(candidate, corpus, rep)
            sparse_features.printFeatures(word_index)
        return

    EventFeatureTwitter(None).GenerateArffFileHeader()
    for candidate in true_events + false_events:
        twitter_features = EventFeatureTwitter(candidate, corpus, rep)
        twitter_features.printFeatures()
def generateData2(_182, sparse=False):
    """Run the feature pipeline over the unbalanced candidate-event data.

    Args:
        _182: Forwarded unchanged to loadUnbalancedData(); its meaning is
            defined by that function (not visible from here).
        sparse: When true, use EventFeatureSparse with the word list/index
            from getCorpusWordList(); otherwise use EventFeatureTwitter.

    Returns None; all output is produced by the feature classes' header and
    printFeatures() calls.
    """
    # The representor is built unconditionally: both feature classes take it.
    rep = Representor()
    corpus = Corpus()
    corpus.buildCorpusOnDB('citybeat', 'candidate_event_25by25_merged')

    true_events, false_events = loadUnbalancedData(_182)

    if sparse:
        word_index, word_list = getCorpusWordList(
            rep, true_events + false_events)
        EventFeatureSparse(None).GenerateArffFileHeader(word_list)
        for candidate in true_events + false_events:
            sparse_features = EventFeatureSparse(candidate, corpus, rep)
            sparse_features.printFeatures(word_index)
        return

    EventFeatureTwitter(None).GenerateArffFileHeader()
    for candidate in true_events + false_events:
        twitter_features = EventFeatureTwitter(candidate, corpus, rep)
        twitter_features.printFeatures()
res.append([word, fre, photos[0:k]]) return res def getTopKeywordsAndPhotos(self, num_keywords, num_photos): keywords = self._getTopKeywordsWithoutStopwords(num_keywords) return self._getRandomPhotosAssociatedWithKeywords( keywords, num_photos) def getTopKeywordsAndPhotosByTFIDF(self, num_keywords, num_photos): keywords = self._getTopKeywordsWithoutStopwords(100000) keywords = self._corpus.chooseTopWordWithHighestTDIDF( keywords, num_keywords) return self._getRandomPhotosAssociatedWithKeywords( keywords, num_photos) if __name__ == '__main__': collection = 'candidate_event_10by10_merged' c = Corpus() c.buildCorpusOnDB('citybeat', collection) ei = EventInterface() ei.setDB('citybeat') ei.setCollection(collection) events = ei.getAllDocuments() for event in events: event = EventFrontend(event, c) print event.getTopKeywordsAndPhotosByTFIDF(10, 0)
k = min(len(photos), k) # discard the keywords with only one photo # if k == 1: # break res.append([word, fre, photos[0:k]]) return res def getTopKeywordsAndPhotos(self, num_keywords, num_photos): keywords = self._getTopKeywordsWithoutStopwords(num_keywords) return self._getRandomPhotosAssociatedWithKeywords(keywords, num_photos) def getTopKeywordsAndPhotosByTFIDF(self, num_keywords, num_photos): keywords = self._getTopKeywordsWithoutStopwords(100000) keywords = self._corpus.chooseTopWordWithHighestTDIDF(keywords, num_keywords) return self._getRandomPhotosAssociatedWithKeywords(keywords, num_photos) if __name__=='__main__': collection = 'candidate_event_10by10_merged' c = Corpus() c.buildCorpusOnDB('citybeat', collection) ei = EventInterface() ei.setDB('citybeat') ei.setCollection(collection) events = ei.getAllDocuments() for event in events: event = EventFrontend(event, c) print event.getTopKeywordsAndPhotosByTFIDF(10,0)