def doClusteringOnUser(self, n_clusters=10):
    """Cluster photos by the combined caption text of each photo's author.

    For every photo returned by ``self.getClusteringData()`` the captions of
    all documents in ``citybeat_production.photos`` that share the photo's
    ``user.username`` are combined into one text.  The texts are turned into
    TF-IDF vectors, normalized and clustered with KMeans.  One row per photo,
    ``latitude,longitude,cluster_label,image_url``, is then written to
    ``self.file_name_prefix + 'text_on_user.csv'``.

    Args:
        n_clusters: Number of KMeans clusters.  Defaults to 10, the value
            that was previously hard-coded.
    """
    photos = self.getClusteringData()

    ei = MongoDBInterface()
    ei.setDB('citybeat_production')
    ei.setCollection('photos')

    # Cache the combined text per username so a user who owns several of the
    # photos is only queried once.
    text_by_user = {}
    all_text = []
    for user_cnt, p in enumerate(photos, 1):
        if user_cnt % 10 == 0:
            # Single-argument form prints the same text whether ``print`` is
            # the Python 2 statement or the Python 3 function.
            print('user  %s' % user_cnt)
        user_name = p['user']['username']
        if user_name not in text_by_user:
            text = ""
            for tp in ei.getAllDocuments({'user.username': user_name}):
                try:
                    # The trailing space keeps the last word of one caption
                    # from fusing with the first word of the next one.
                    text += tp['caption']['text'] + ' '
                except (KeyError, TypeError):
                    # Document without a usable caption text: skip it.
                    continue
            text_by_user[user_name] = text
        all_text.append(text_by_user[user_name])

    vectorizer = TfidfVectorizer(max_df=0.1, lowercase=True, sublinear_tf=True,
                                 min_df=10, stop_words='english', use_idf=True)
    X = vectorizer.fit_transform(all_text)
    print('shape =  %s' % (X.shape,))

    # Alternative left disabled by the original author:
    # algo = SpectralClustering(n_clusters=5)
    algo = KMeans(n_clusters)
    X = normalize(X)
    algo.fit(X)

    # ``with`` guarantees the CSV is flushed and closed, even on error.
    with open(self.file_name_prefix + 'text_on_user.csv', 'w') as f:
        # all_text holds exactly one entry per photo, so labels_ lines up
        # with photos position by position.
        for p, label in zip(photos, algo.labels_):
            row = ','.join([
                str(p['location']['latitude']),
                str(p['location']['longitude']),
                str(label),
                p['images']['standard_resolution']['url'],
            ])
            f.write(row + '\n')
from photo_interface import PhotoInterface
from caption_parser import CaptionParser
from mongodb_interface import MongoDBInterface
from photo import Photo

import random

if __name__ == '__main__':
    # Feed every caption stored in test_caption.captions into a CaptionParser,
    # then print the words returned by getTopWords(300) on a single line,
    # each one quoted and followed by a comma.
    mi = MongoDBInterface()
    mi.setDB('test_caption')
    mi.setCollection('captions')

    cp = CaptionParser(True)
    for i, caption in enumerate(mi.getAllDocuments(), 1):
        if i % 1000 == 0:
            # Progress report every 1000 captions, printed before the
            # current caption is inserted.
            # (disabled debug output: print cp.getTopWords(200))
            print(i)
            print(len(cp._))
        cp.insertCaption(caption['caption'])

    # One joined print replaces the Python-2-only ``print x,`` idiom while
    # producing the same line: items separated by one space, then a newline.
    quoted_words = ["'" + word + "'," for word, _value in cp.getTopWords(300)]
    print(' '.join(quoted_words))
from photo_interface import PhotoInterface
from caption_parser import CaptionParser
from photo import Photo
from mongodb_interface import MongoDBInterface

import random

if __name__ == '__main__':
    # Copy the captions of a random subset of citybeat.photos into
    # test_caption.captions.
    photo_source = PhotoInterface()
    photo_source.setDB('citybeat')
    photo_source.setCollection('photos')

    caption_sink = MongoDBInterface()
    caption_sink.setDB('test_caption')
    caption_sink.setCollection('captions')

    for raw_photo in photo_source.getAllDocuments():
        # randint is inclusive on both ends, so only a draw of 0 (one of the
        # eleven possible values) lets a photo through.
        if random.randint(0, 10) == 0:
            caption_text = Photo(raw_photo).getCaption()
            # Photos with an empty caption are not stored.
            if len(caption_text) > 0:
                caption_sink.saveDocument({'caption': caption_text})
from photo_interface import PhotoInterface
from caption_parser import CaptionParser
from mongodb_interface import MongoDBInterface
from photo import Photo

import random

if __name__ == '__main__':
    # Feed every caption stored in test_caption.captions into a CaptionParser,
    # then print the words returned by getTopWords(300) on a single line,
    # each one quoted and followed by a comma.
    mi = MongoDBInterface()
    mi.setDB('test_caption')
    mi.setCollection('captions')

    cp = CaptionParser(True)
    for i, caption in enumerate(mi.getAllDocuments(), 1):
        if i % 1000 == 0:
            # Progress report every 1000 captions, printed before the
            # current caption is inserted.
            # (disabled debug output: print cp.getTopWords(200))
            print(i)
            print(len(cp._))
        cp.insertCaption(caption['caption'])

    # One joined print replaces the Python-2-only ``print x,`` idiom while
    # producing the same line: items separated by one space, then a newline.
    quoted_words = ["'" + word + "'," for word, _value in cp.getTopWords(300)]
    print(' '.join(quoted_words))
source_mongodb_port = 27017 source_connection = pymongo.Connection(source_mongodb_address, source_mongodb_port) source_connection['admin'].authenticate( 'admin', 'mediumdatarules') target_mongodb_address = 'grande.rutgers.edu' target_mongodb_port = 27017 target_connection = pymongo.Connection(target_mongodb_address, target_mongodb_port) print source_connection.database_names() print source_connection['citybeat_production'].collection_names() print target_connection['citybeat_production'].collection_names() for collection in source_connection['citybeat_production'].collection_names(): print 'start collection: ' + collection source_interface = MongoDBInterface() source_interface._connection = source_connection source_interface.setDB('citybeat_production') source_interface.setCollection(collection) target_interface = MongoDBInterface() target_interface._connection = target_connection target_interface.setDB('citybeat_production') target_interface.setCollection(collection) count = 0 for e in source_interface.getAllDocuments(): try: target_interface.saveDocument(e) count += 1 except Exception:
def main():
    """Print the document IDs of the 'test' collection in 'test_chaolun'."""
    interface = MongoDBInterface()
    interface.setDB('test_chaolun')
    interface.setCollection('test')

    document_ids = interface.getAllDocumentIDs()
    print(document_ids)
from photo_interface import PhotoInterface
from caption_parser import CaptionParser
from photo import Photo
from mongodb_interface import MongoDBInterface

import random

if __name__ == '__main__':
    # Copy the captions of a random subset of citybeat.photos into
    # test_caption.captions.
    photo_source = PhotoInterface()
    photo_source.setDB('citybeat')
    photo_source.setCollection('photos')

    caption_sink = MongoDBInterface()
    caption_sink.setDB('test_caption')
    caption_sink.setCollection('captions')

    for raw_photo in photo_source.getAllDocuments():
        # randint is inclusive on both ends, so only a draw of 0 (one of the
        # eleven possible values) lets a photo through.
        if random.randint(0, 10) == 0:
            caption_text = Photo(raw_photo).getCaption()
            # Photos with an empty caption are not stored.
            if len(caption_text) > 0:
                caption_sink.saveDocument({'caption': caption_text})