Example 1
import datetime
import random
import re
import threading
import time

from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3-style import (Python 2)

# WeiboCrawler and basepath are assumed to come from the surrounding project.


class GetLatestBlog(threading.Thread):
	def __init__(self, jobs_queue, results_queue, gsid, proxy=None):
		threading.Thread.__init__(self)
		self.jobs_queue = jobs_queue
		self.results_queue = results_queue
		self.gsid = gsid
		self.proxy = proxy
		# one crawler instance per worker thread, bound to its own account and proxy
		self.wc = WeiboCrawler()
		self.wc.setGsid(self.gsid)
		self.wc.setProxy(self.proxy)

	def run(self):
		while True:
			time.sleep(random.randint(2, 4))

			uid, page = self.jobs_queue.get()
			# mark the job as taken right away; a failed fetch is re-queued below
			self.jobs_queue.task_done()

			if page is None:
				page = "1"
			resp = self.wc.getMicroBlogs(uid, page)
			if resp is None:
				# fetch failed: put the job back on the queue and move on
				self.jobs_queue.put((uid, page))
				continue
			soup = BeautifulSoup(resp)
			body = soup.body
			mblogs = body.findAll("div", {"class": "c", "id": re.compile(u"M_")})
			if not mblogs:  # no micro blogs on this page
				continue
			# append this page's posts to today's dump file
			blogs_file = open("%s/data/blogs/%s.blog" % (basepath, datetime.date.today()), "a")
			for mblog in mblogs:
				blogs_file.write("[%s]:%s\n" % (uid, mblog))
			blogs_file.close()
Example 2
import Queue

from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3-style import (Python 2)

# WeiboCrawler and GetLatestBlog (Example 1) are assumed to be importable here.


def main():
	results_queue = Queue.Queue()
	jobs_queue = Queue.Queue()

	wc = WeiboCrawler()
	accounts = wc.getAllGsidProxyPair()

	# use the first account for the initial request that discovers the page count
	gsid, proxy = accounts[0][0], accounts[0][1]
	if proxy == "None":
		proxy = None
	wc.setGsid(gsid)
	wc.setProxy(proxy)

	# fetch the first page for a sample user and read the total page count from
	# the pager's "mp" input (its value attribute is assumed to hold the count)
	res = wc.getMicroBlogs("1646194541")
	soup = BeautifulSoup(res)
	pagelist = soup.find("div", {"id": "pagelist"})
	mp = pagelist.find("input", {"name": "mp"})

	uid = "xxxxxxxxx"  # placeholder: uid of the user whose posts will be crawled
	for page in range(1, int(mp["value"]) + 1):
		jobs_queue.put((uid, page))
	# start one daemon worker per crawler account, each with its own gsid/proxy
	for account in accounts:
		gsid = account[0]
		proxy = account[1]
		if proxy == "None":
			proxy = None
		glb = GetLatestBlog(jobs_queue, results_queue, gsid, proxy)
		glb.setDaemon(True)
		glb.start()

	# block until every queued (uid, page) job has been marked done by a worker
	jobs_queue.join()
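
Both examples follow the same producer/consumer layout: main() fills a Queue with (uid, page) jobs, GetLatestBlog worker threads drain it, and the task_done()/join() pairing lets main() wait until the queue has been processed. The sketch below isolates that pattern with a stand-in for the WeiboCrawler call so it can run without the project's dependencies; the worker body, job tuple shape, and page count are illustrative assumptions modeled on the examples above.

import threading
import time

try:
	import Queue as queue  # Python 2
except ImportError:
	import queue  # Python 3

def worker(jobs_queue, results_queue):
	# drain (uid, page) jobs forever, like GetLatestBlog.run()
	while True:
		uid, page = jobs_queue.get()
		try:
			time.sleep(0.1)  # stand-in for wc.getMicroBlogs(uid, page)
			results_queue.put((uid, page, "<html>...</html>"))
		finally:
			jobs_queue.task_done()

def run_demo():
	jobs_queue = queue.Queue()
	results_queue = queue.Queue()

	# one job per page, as main() does after reading the "mp" page count
	for page in range(1, 6):
		jobs_queue.put(("1646194541", page))

	# a few daemon workers, as main() starts one per crawler account
	for _ in range(3):
		t = threading.Thread(target=worker, args=(jobs_queue, results_queue))
		t.daemon = True
		t.start()

	# returns once every job has been matched by a task_done() call
	jobs_queue.join()
	return results_queue

if __name__ == "__main__":
	print(run_demo().qsize())  # 5 results, one per queued page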