Example 1
# The original snippet omits its imports: Queue is the Python 2 stdlib module,
# WeiboCrawler and GetLatestBlog are project classes, and BeautifulSoup is the
# third-party HTML parser.
import Queue


def main():
	results_queue = Queue.Queue()
	jobs_queue = Queue.Queue()

	# Log in with the first available (gsid, proxy) account pair.
	wc = WeiboCrawler()
	accounts = wc.getAllGsidProxyPair()

	gsid, proxy = accounts[0][0], accounts[0][1]
	if proxy == "None":
		proxy = None
	wc.setGsid(gsid)
	wc.setProxy(proxy)

	# Fetch the first page of the user's microblogs and locate the hidden "mp"
	# input inside the pager, whose value attribute holds the total page count.
	res = wc.getMicroBlogs("1646194541")
	soup = BeautifulSoup(res)
	pagelist = soup.find("div", {"id": "pagelist"})
	mp = pagelist.find("input", {"name": "mp"})

	# Queue one (uid, page) job per page; the uid is left masked in the original.
	uid = "xxxxxxxxx"
	for page in range(1, int(mp["value"]) + 1):
		jobs_queue.put((uid, page))

	# Start one daemon worker per account; each drains jobs_queue and pushes
	# fetched pages onto results_queue.
	for account in accounts:
		gsid = account[0]
		proxy = account[1]
		if proxy == "None":
			proxy = None
		glb = GetLatestBlog(jobs_queue, results_queue, gsid, proxy)
		glb.setDaemon(True)
		glb.start()

	# Block until every queued job has been marked done.
	jobs_queue.join()


if __name__ == "__main__":
	main()
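
GetLatestBlog itself is not defined in this excerpt; it is used as a daemon worker that drains (uid, page) jobs from jobs_queue and pushes results onto results_queue. Below is a minimal, self-contained sketch of that pattern; PageWorker and fetch_page are hypothetical names, and the real worker also receives a gsid/proxy pair rather than a bare fetch function.

import Queue
import threading

def fetch_page(uid, page):
	# Placeholder for a real crawler call such as WeiboCrawler.getMicroBlogs();
	# the actual fetching API is not shown in the excerpt.
	return "<html for %s page %d>" % (uid, page)

class PageWorker(threading.Thread):
	"""Hypothetical stand-in for a GetLatestBlog-style worker thread."""

	def __init__(self, jobs_queue, results_queue):
		threading.Thread.__init__(self)
		self.jobs_queue = jobs_queue
		self.results_queue = results_queue

	def run(self):
		while True:
			uid, page = self.jobs_queue.get()
			try:
				self.results_queue.put((uid, page, fetch_page(uid, page)))
			finally:
				self.jobs_queue.task_done()

if __name__ == "__main__":
	jobs_queue, results_queue = Queue.Queue(), Queue.Queue()
	for page in range(1, 4):
		jobs_queue.put(("1646194541", page))
	worker = PageWorker(jobs_queue, results_queue)
	worker.setDaemon(True)
	worker.start()
	jobs_queue.join()
	print results_queue.qsize(), "pages fetched"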
Example 2
					with open("baby_search_result.dat", "a") as sr:
						sr.write("%s\n" % user_doc)
				break
	
			self.jobs.task_done()


if __name__ == "__main__":
	results_queue = Queue.Queue()
	# Optional result-persisting thread, left disabled in the original example:
#	pj = PersistJobs(results_queue)
#	pj.setDaemon(True)
#	pj.start()

	crawler = WeiboCrawler()
	accounts_list = crawler.getAllGsidProxyPair()

	# Each task line has the form: keyword|year|month|day|days
	with open(sys.argv[1].strip()) as tasks:
		for task in tasks:
			if len(task.strip()) == 0:
				continue
			items = task.split("|")
			keyword = items[0].strip()
			year = int(items[1].strip())
			month = int(items[2].strip())
			day = int(items[3].strip())
			days = int(items[4].strip())
			keywords = Queue.Queue()
			jobs = Queue.Queue()
			start = datetime.date(year, month, day)
			print year, month, day
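
The excerpt breaks off right after the start date is computed. The sketch below shows how a (keyword, start, days) task could be expanded into per-day jobs with datetime.timedelta; the concrete task values, the job tuple layout, and the date format are assumptions, not the original code.

import Queue
import datetime

jobs = Queue.Queue()
keyword = "example"                # hypothetical task values
start = datetime.date(2013, 1, 1)
days = 3

# One (keyword, YYYYMMDD) job per calendar day in the requested window.
for offset in range(days):
	day_to_search = start + datetime.timedelta(days=offset)
	jobs.put((keyword, day_to_search.strftime("%Y%m%d")))

while not jobs.empty():
	print jobs.get()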
Example 3
            # Tail of InitJobQueue.run(): queue one (mid, page, 0) job per page,
            # then mark the mid as handled.
            for i in range(1, maxPage + 1):
                self.jobs_queue.put((mid, i, 0))
            self.mid_queue.task_done()





if __name__ == "__main__":
    mid_queue = Queue.Queue()
    jobs_queue = Queue.Queue()
    
    mid_queue.put(("y3FIBFbE0", 0)) # test
    
    crawler = WeiboCrawler()
    accounts = crawler.getAllGsidProxyPair()
    
    # Start one crawler and one job-seeding thread per account.
    account_list = []
    for account in accounts:
        gsid, proxy = account[0], account[1]
        if proxy == "None":
            proxy = None
        c = WeiboCrawler()
        c.setGsid(gsid)
        c.setProxy(proxy)
        ijq = InitJobQueue(c, mid_queue, jobs_queue)
        ijq.setDaemon(True)
        ijq.start()
        account_list.append(c)

    # Wait until every queued mid has been expanded into page jobs.
    mid_queue.join()
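
Only the tail of InitJobQueue.run() appears at the top of this example. The sketch below shows the overall worker shape it implies; the MidPageSeeder name, the lookup_max_page helper, and the way the page count is obtained are assumptions, since the original class body is not shown.

import threading

class MidPageSeeder(threading.Thread):
    """Sketch of an InitJobQueue-style seeder (a reconstruction, not the original)."""

    def __init__(self, crawler, mid_queue, jobs_queue):
        threading.Thread.__init__(self)
        self.crawler = crawler
        self.mid_queue = mid_queue
        self.jobs_queue = jobs_queue

    def run(self):
        while True:
            mid, flag = self.mid_queue.get()
            # The excerpt does not show how the page count is found; assume the
            # crawler can report one for the given status id.
            maxPage = self.lookup_max_page(mid)
            for i in range(1, maxPage + 1):
                self.jobs_queue.put((mid, i, 0))
            self.mid_queue.task_done()

    def lookup_max_page(self, mid):
        # Hypothetical helper: a real implementation would parse the first page
        # returned by self.crawler for this mid.
        return 1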
Example 4
	# Connect to the local MongoDB instance (legacy pymongo Connection API).
	try:
		connect = Connection(host="localhost", port=27017)
		print "Connected Successfully"
	except ConnectionFailure, e:
		sys.stderr.write("Could not connect to MongoDB: %s\n" % e)
		sys.exit(1)
	# Get a Database handle
	dbh = connect['weibo']

	# Work queues for followers (fans), followees/attentions (atts) and crawl jobs.
	fans = Queue.Queue()
	atts = Queue.Queue()
	jobs = Queue.Queue()

	wc = WeiboCrawler()
	accounts = wc.getAllGsidProxyPair()
	for account in accounts:
		gsid, proxy = account[0], account[1]
		if proxy == "None":
			proxy = None  # normalize the textual placeholder, as in the other examples
		print gsid, proxy
		wct = WeiboCrawler(gsid=gsid, proxy=proxy)
		wct.setGsid(gsid)
		wct.setProxy(proxy)

		# One social-graph crawler thread and one attention-jobs preparer per account.
		wsg = WeiboSocialGraph(jobs, dbh, wct)
		wsg.setDaemon(True)
		wsg.start()

		paj = PrepareAttsJobs(atts, jobs, dbh, wct)
		paj.setDaemon(True)
		paj.start()
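
Connection and the "except ConnectionFailure, e:" form used above are the legacy pymongo / Python 2 combination that runs throughout these examples. For reference only, a roughly equivalent connection check with current pymongo (MongoClient) looks like the sketch below; it is an illustration, not part of the original project.

import sys

from pymongo import MongoClient
from pymongo.errors import ConnectionFailure

try:
    client = MongoClient(host="localhost", port=27017)
    # MongoClient connects lazily, so ping the server to verify it is reachable.
    client.admin.command("ping")
    print("Connected Successfully")
except ConnectionFailure as e:
    sys.stderr.write("Could not connect to MongoDB: %s\n" % e)
    sys.exit(1)

dbh = client["weibo"]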