def getPages():
    """Upsert one page document for every like entry in the likes collection.

    Walks every document returned by the likes collection, reads the list
    stored under its 'data' key, and upserts a ``{'_id': <like id>}`` document
    into the page collection for each entry. A running count is printed after
    every upsert and a summary line is printed at the end.

    NOTE: the count is incremented once per like entry, so an id that appears
    in several likes documents is counted (and upserted) each time.
    """
    likeDocs = getLikesCollection()
    pageDocs = getPageCollection()
    processed = 0
    for likeDoc in likeDocs.find():
        for entry in likeDoc['data']:
            pageKey = {'_id': entry['id']}
            # The same document serves as filter and replacement, so the
            # upsert creates the page if missing and is a no-op otherwise.
            pageDocs.update(pageKey, pageKey, upsert=True)
            processed += 1
            # Single-argument form prints the same text whether `print` is
            # the Python 2 statement or the print function.
            print(processed)
    print('%s %s %s' % ('Total', processed, 'pages fetched'))
def getPageData(workerCount=200):
    """Queue pages lacking stored data, run workers, then dump data to files.

    Steps:
      1. Every page document with no match in the page-data collection has
         its '_id' put on a queue.
      2. ``workerCount`` ``fetchingPageData`` workers are started on that
         queue as daemon threads, and the call blocks on ``queue.join()``.
         (Presumably the workers fetch each page and store the result in the
         page-data collection -- confirm against ``fetchingPageData``.)
      3. Every document in the page-data collection has its 'data' value
         written to ``data/<_id>.txt``.

    Args:
        workerCount: number of worker objects to start. Defaults to 200, the
            value that was previously hard-coded. With no workers and a
            non-empty queue, ``queue.join()`` would never return.
    """
    pageCollection = getPageCollection()
    pageDataCollection = getPageDataCollection()
    queue = Queue()

    # NOTE: the counter starts at 1, so each printed value is one more than
    # the number of ids queued so far. Kept as-is to leave output unchanged.
    index = 1
    for page_id in pageCollection.find():
        # `page_id` is the whole page document; it is passed unchanged as the
        # lookup filter for the page-data collection.
        if not pageDataCollection.find_one(page_id):
            queue.put(page_id['_id'])
            index += 1
            print(index)

    for _ in range(workerCount):
        t = fetchingPageData(queue)
        # Daemon workers do not keep the process alive after this returns.
        t.setDaemon(True)
        t.start()
    queue.join()

    for doc in pageDataCollection.find():
        # The context manager closes the file even if write() raises; the
        # earlier open()/close() pair leaked the handle in that case.
        with open('data/' + doc['_id'] + '.txt', 'w') as f:
            f.write(doc['data'])
def getPageData(workerCount=200):
    """Queue pages lacking stored data, run workers, then dump data to files.

    Steps:
      1. Every page document with no match in the page-data collection has
         its '_id' put on a queue.
      2. ``workerCount`` ``fetchingPageData`` workers are started on that
         queue as daemon threads, and the call blocks on ``queue.join()``.
         (Presumably the workers fetch each page and store the result in the
         page-data collection -- confirm against ``fetchingPageData``.)
      3. Every document in the page-data collection has its 'data' value
         written to ``data/<_id>.txt``.

    Args:
        workerCount: number of worker objects to start. Defaults to 200, the
            value that was previously hard-coded. With no workers and a
            non-empty queue, ``queue.join()`` would never return.
    """
    pageCollection = getPageCollection()
    pageDataCollection = getPageDataCollection()
    queue = Queue()

    # NOTE: the counter starts at 1, so each printed value is one more than
    # the number of ids queued so far. Kept as-is to leave output unchanged.
    index = 1
    for page_id in pageCollection.find():
        # `page_id` is the whole page document; it is passed unchanged as the
        # lookup filter for the page-data collection.
        if not pageDataCollection.find_one(page_id):
            queue.put(page_id['_id'])
            index += 1
            print(index)

    for _ in range(workerCount):
        t = fetchingPageData(queue)
        # Daemon workers do not keep the process alive after this returns.
        t.setDaemon(True)
        t.start()
    queue.join()

    for doc in pageDataCollection.find():
        # The context manager closes the file even if write() raises; the
        # earlier open()/close() pair leaked the handle in that case.
        with open('data/' + doc['_id'] + '.txt', 'w') as f:
            f.write(doc['data'])
from database import (
    getPageCollection,
    getLikesCollection,
    getPagesClusterInfoCollection,
    getClusterCollection,
)
from pprint import pprint
import dateutil.parser as dateparser

# Rebuilds the pages-cluster-info collection: one document per page, holding
# the likes documents that reference the page (with the parsed like time when
# present), how many such documents there are, and the page's cluster.
allpages = getPageCollection()
alllikes = getLikesCollection()
fbpagesinfo = getPagesClusterInfoCollection()
clusterinfo = getClusterCollection()

# The target collection is rebuilt from scratch on every run.
fbpagesinfo.drop()

counter = 0
for pageId in allpages.find():
    # All likes documents whose 'data' array has an entry for this page.
    cursor = alllikes.find({'data': {'$elemMatch': {'id': pageId['_id']}}})

    # find_one returns None when no cluster document lists this page.
    # Indexing that None raised TypeError and aborted the rebuild after the
    # collection had already been dropped, so record None instead.
    cluster = clusterinfo.find_one({'pages': pageId['_id']})
    cluster = cluster["cluster"] if cluster is not None else None

    document = {
        '_id': pageId['_id'],
        'people': [],
        'count': cursor.count(),
        'cluster': cluster,
    }
    for c in cursor:
        dd = {'id': c['id']}
        # Find this page's entry in the likes document to pick up the time
        # of the like; only the first matching entry is examined.
        for pages in c['data']:
            if pages['id'] == pageId['_id']:
                if 'created_time' in pages:
                    dd['created_time'] = dateparser.parse(
                        pages['created_time'])
                break
        document['people'].append(dd)

    counter += 1
    # Pre-formatted single argument: prints the same text whether `print` is
    # the Python 2 statement or the print function.
    print('document %s done' % counter)
    fbpagesinfo.insert(document)
from database import getPageCollection, getLikesCollection, getPagesClusterInfoCollection, getClusterCollection from pprint import pprint import dateutil.parser as dateparser allpages = getPageCollection() alllikes = getLikesCollection() fbpagesinfo = getPagesClusterInfoCollection() clusterinfo = getClusterCollection() fbpagesinfo.drop() counter = 0 for pageId in allpages.find(): cursor = alllikes.find({'data': {'$elemMatch': {'id': pageId['_id']}}}) cluster = clusterinfo.find_one({'pages': pageId['_id']}) cluster = cluster["cluster"] document = { '_id': pageId['_id'], 'people': [], 'count': cursor.count(), 'cluster': cluster } for c in cursor: dd = {'id': c['id']} for pages in c['data']: if pages['id'] == pageId['_id']: if 'created_time' in pages: dd['created_time'] = dateparser.parse( pages['created_time']) break document['people'].append(dd)