from database import getLikesCollection, getPageCollection


def getPages():
    # Collect every page id that appears in any user's likes into the page collection.
    likesCollection = getLikesCollection()
    pageCollection = getPageCollection()
    counter = 0
    for likes in likesCollection.find():
        likes = likes['data']  # list of pages this user likes
        for like in likes:
            page = {'_id': like['id']}
            # Upsert keeps page ids unique even when many users like the same page.
            pageCollection.update(page, page, upsert=True)
            counter += 1
            print counter
    # counter counts like entries processed, not distinct pages.
    print 'Total', counter, 'pages fetched'
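# The scripts in this listing read likes documents through likesCollection.find().
# Below is a sketch of what one such document appears to look like, inferred from
# the fields these scripts access ('id', 'data', and each entry's 'id' and
# 'created_time'); the 'name' field and all concrete values are illustrative
# assumptions only.
example_likes_document = {
    '_id': '1234567890',        # Mongo id of the user record
    'id': '1234567890',         # user id (read as c['id'] in the cluster script below)
    'data': [                   # pages this user likes
        {
            'id': '111222333',                           # page id, harvested by getPages()
            'name': 'Some Page',                         # assumed, not read by these scripts
            'created_time': '2013-05-01T12:00:00+0000',  # parsed with dateutil in the cluster script
        },
    ],
}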
from Queue import Queue

from database import getPageCollection, getPageDataCollection
# fetchingPageData is a Thread subclass defined elsewhere in the project;
# a hypothetical sketch of it follows this function.


def getPageData():
    pageCollection = getPageCollection()
    pageDataCollection = getPageDataCollection()

    # Queue every page whose raw data has not been fetched yet.
    queue = Queue()
    index = 0
    for page_id in pageCollection.find():
        if not pageDataCollection.find_one(page_id):
            queue.put(page_id['_id'])
            index += 1

    print index, 'pages queued'

    # 200 daemon worker threads drain the queue concurrently.
    for i in range(200):
        t = fetchingPageData(queue)
        t.setDaemon(True)
        t.start()

    # Blocks until every queued id has been marked done by a worker.
    queue.join()

    # Dump each fetched page's raw data to its own text file.
    for doc in pageDataCollection.find():
        f = open('data/' + doc['_id'] + '.txt', 'w')
        f.write(doc['data'])
        f.close()
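# The worker class fetchingPageData used by getPageData() is not part of this
# listing. Below is a minimal, hypothetical sketch of what such a worker might
# look like, assuming each thread pulls page ids off the shared queue, fetches
# the page from the Facebook Graph API, stores the raw response in the
# page-data collection, and marks the task done so queue.join() can return.
# The Graph API URL format, the access token handling and the stored document
# shape are assumptions, not the project's actual code.
import urllib2
from threading import Thread

from database import getPageDataCollection

GRAPH_URL = 'https://graph.facebook.com/%s?access_token=%s'  # assumed endpoint format
ACCESS_TOKEN = 'YOUR_ACCESS_TOKEN'                           # placeholder


class fetchingPageData(Thread):

    def __init__(self, queue):
        Thread.__init__(self)
        self.queue = queue
        self.pageDataCollection = getPageDataCollection()

    def run(self):
        while True:
            page_id = self.queue.get()
            try:
                data = urllib2.urlopen(GRAPH_URL % (page_id, ACCESS_TOKEN)).read()
                self.pageDataCollection.insert({'_id': page_id, 'data': data})
            except Exception as e:
                print 'failed to fetch page', page_id, e
            finally:
                # Required so that queue.join() in getPageData() eventually unblocks.
                self.queue.task_done()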
from database import getPageCollection, getLikesCollection, getPagesClusterInfoCollection, getClusterCollection
from pprint import pprint
import dateutil.parser as dateparser

# For every page, record which users like it, when each like was created,
# and which cluster the page belongs to.
allpages = getPageCollection()
alllikes = getLikesCollection()
fbpagesinfo = getPagesClusterInfoCollection()
clusterinfo = getClusterCollection()

fbpagesinfo.drop()  # rebuild the summary collection from scratch
counter = 0
for pageId in allpages.find():
    # All users whose likes contain this page.
    cursor = alllikes.find({'data': {'$elemMatch': {'id': pageId['_id']}}})
    # Assumes every page id appears in exactly one cluster document.
    cluster = clusterinfo.find_one({'pages': pageId['_id']})
    cluster = cluster['cluster']

    document = {'_id': pageId['_id'], 'people': [], 'count': cursor.count(), 'cluster': cluster}
    for c in cursor:
        dd = {'id': c['id']}
        for pages in c['data']:
            if pages['id'] == pageId['_id']:
                if 'created_time' in pages:
                    dd['created_time'] = dateparser.parse(pages['created_time'])
                    break
        document['people'].append(dd)
    counter += 1
    print 'document', counter, 'done'
    fbpagesinfo.insert(document)
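# The "database" module imported throughout this listing is not included here.
# Below is a minimal sketch of what it might contain, assuming a local MongoDB
# instance; the host, database name and collection names are assumptions, and
# only the function names are taken from the imports above.
from pymongo import MongoClient

_client = MongoClient('localhost', 27017)   # assumed local MongoDB
_db = _client['facebook']                    # hypothetical database name


def getLikesCollection():
    # One document per user: {'_id': user_id, 'id': user_id, 'data': [liked pages]}.
    return _db['likes']


def getPageCollection():
    # One document per distinct page id, built by getPages().
    return _db['pages']


def getPageDataCollection():
    # Raw page data fetched by the getPageData() workers.
    return _db['page_data']


def getPagesClusterInfoCollection():
    # Per-page summary (people, like count, cluster) built by the script above.
    return _db['pages_cluster_info']


def getClusterCollection():
    # Maps each cluster to its member page ids, queried via {'pages': page_id}.
    return _db['clusters']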