def getPageData():
	"""Fetch data for pages that have none stored, then dump all data to disk.

	Steps, in order:
	1. Queue the ``_id`` of every document in the page collection for which
	   ``find_one`` on the page-data collection returns nothing.
	2. Start 200 daemon ``fetchingPageData`` workers on that queue and block
	   on ``queue.join()``.
	3. Write the ``data`` field of every page-data document to
	   ``data/<_id>.txt``.

	Prints one more than the number of queued ids (the counter starts at 1).
	"""
	pageCollection = getPageCollection()
	pageDataCollection = getPageDataCollection()

	queue = Queue()
	index = 1
	for page_id in pageCollection.find():
		# NOTE(review): the whole page document is passed as the query;
		# presumably page documents carry only ``_id`` -- confirm, otherwise
		# this should filter on ``_id`` alone.
		if not pageDataCollection.find_one(page_id):
			queue.put(page_id['_id'])
			index += 1

	# The parenthesised form prints the same text under the Python 2 print
	# statement and the Python 3 print function.
	print(index)

	for _ in range(200):
		t = fetchingPageData(queue)
		# Daemon workers do not keep the process alive after the main thread.
		t.daemon = True
		t.start()

	# Presumably returns once the workers have marked every queued id as
	# done (task_done()); the worker body is not visible here -- confirm.
	queue.join()

	for doc in pageDataCollection.find():
		# "with" closes the file even if the write raises.
		with open('data/' + doc['_id'] + '.txt', 'w') as f:
			f.write(doc['data'])
Beispiel #2
0
def getPageData():
    """Fetch data for pages that have none stored, then dump all data to disk.

    Steps, in order:
    1. Queue the ``_id`` of every document in the page collection for which
       ``find_one`` on the page-data collection returns nothing.
    2. Start 200 daemon ``fetchingPageData`` workers on that queue and block
       on ``queue.join()``.
    3. Write the ``data`` field of every page-data document to
       ``data/<_id>.txt``.

    Prints one more than the number of queued ids (the counter starts at 1).
    """
    pageCollection = getPageCollection()
    pageDataCollection = getPageDataCollection()

    queue = Queue()
    index = 1
    for page_id in pageCollection.find():
        # NOTE(review): the whole page document is passed as the query;
        # presumably page documents carry only ``_id`` -- confirm, otherwise
        # this should filter on ``_id`` alone.
        if not pageDataCollection.find_one(page_id):
            queue.put(page_id['_id'])
            index += 1

    # The parenthesised form prints the same text under the Python 2 print
    # statement and the Python 3 print function.
    print(index)

    for _ in range(200):
        t = fetchingPageData(queue)
        # Daemon workers do not keep the process alive after the main thread.
        t.daemon = True
        t.start()

    # Presumably returns once the workers have marked every queued id as
    # done (task_done()); the worker body is not visible here -- confirm.
    queue.join()

    for doc in pageDataCollection.find():
        # "with" closes the file even if the write raises.
        with open('data/' + doc['_id'] + '.txt', 'w') as f:
            f.write(doc['data'])
	def __init__(self, queue):
		"""Initialise the thread and remember its queue and data collection.

		queue -- object stored unchanged as ``self.queue``.

		``self.collection`` is set to whatever ``getPageDataCollection()``
		returns.
		"""
		threading.Thread.__init__(self)
		self.collection = getPageDataCollection()
		self.queue = queue
Beispiel #4
0
 def __init__(self, queue):
     """Initialise the thread and remember its queue and data collection.

     queue -- object stored unchanged as ``self.queue``.

     ``self.collection`` is set to whatever ``getPageDataCollection()``
     returns.
     """
     threading.Thread.__init__(self)
     self.collection = getPageDataCollection()
     self.queue = queue
    try:
        k = KVAL
    except ValueError():
        usage()

    vocab = {}
    xs = []
    args = []
    fid = ''
    fetchTestpage(PID)

    clusterCollection = getClusterCollection()

    clusters = clusterCollection.distinct("cluster")

    fbdataCollection = getPageDataCollection()
    for cluster in clusterCollection.find():
        pageIds = cluster['pages'][:len(cluster['pages']) / 10]
        for p in pageIds:
            data = fbdataCollection.find_one({'_id': p})
            _id = data['_id']
            string = data['data']
            f = open('temp/' + _id + '.txt', 'w')
            f.write(string)
            f.close()

    for name in glob.glob('./temp/*.txt'):
        args.append(name)

    for a in args:
        x = defaultdict(float)
Beispiel #6
0
    try:
        k = KVAL
    except ValueError():
        usage()

    vocab = {}
    xs = []
    args = []
    fid = ''
    fetchTestpage(PID)

    clusterCollection = getClusterCollection()

    clusters = clusterCollection.distinct("cluster")

    fbdataCollection = getPageDataCollection()
    for cluster in clusterCollection.find():
        pageIds = cluster['pages'][:len(cluster['pages']) / 10]
        for p in pageIds:
            data = fbdataCollection.find_one({'_id': p})
            _id = data['_id']
            string = data['data']
            f = open('temp/' + _id + '.txt', 'w')
            f.write(string)
            f.close()

    for name in glob.glob('./temp/*.txt'):
        args.append(name)

    for a in args:
        x = defaultdict(float)