Example #1
class GetLatestBlog(threading.Thread):
	def __init__(self, jobs_queue, results_queue, gsid, proxy=None):
		threading.Thread.__init__(self)
		self.jobs_queue = jobs_queue
		self.results_queue = results_queue
		self.gsid = gsid
		self.proxy = proxy
		self.wc = WeiboCrawler()
		self.wc.setGsid(self.gsid)
		self.wc.setProxy(self.proxy)

	def run(self):
		while True:
			# throttle a little so the requests do not hammer the API
			time.sleep(random.randint(2, 4))

			uid, page = self.jobs_queue.get()
			if page is None:
				page = "1"

			resp = self.wc.getMicroBlogs(uid, page)
			if resp is None:
				# fetch failed: put the whole job back so another worker can retry it
				self.jobs_queue.put((uid, page))
				self.jobs_queue.task_done()
				continue

			soup = BeautifulSoup(resp)
			body = soup.body
			mblogs = body.findAll("div", {"class": "c", "id": re.compile(u"M_")})
			if not mblogs:  # findAll returns an empty list when the user has no micro blogs
				self.jobs_queue.task_done()
				continue

			# append the raw blog blocks to today's dump file, tagged with the uid
			blogs_file = open("%s/data/blogs/%s.blog" % (basepath, datetime.date.today()), "a")
			for mblog in mblogs:
				blogs_file.write("[%s]:%s\n" % (uid, mblog))
			blogs_file.close()
			# only mark the job done once the page has actually been written out
			self.jobs_queue.task_done()
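Each record in the daily .blog file is written as [uid]: followed by the raw HTML of one micro blog div, so every saved block stays attributable to the uid it was fetched for.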
Example #2
def main():
	results_queue = Queue.Queue()
	jobs_queue = Queue.Queue()

	wc = WeiboCrawler()
	accounts = wc.getAllGsidProxyPair()

	gsid, proxy = accounts[0][0], accounts[0][1]
	if proxy == "None":
		proxy = None
	wc.setGsid(gsid)
	wc.setProxy(proxy)

	# fetch the first page once to find out how many pages of micro blogs exist
	res = wc.getMicroBlogs("1646194541")
	soup = BeautifulSoup(res)
	pagelist = soup.find("div", {"id": "pagelist"})
	mp = pagelist.find("input", {"name": "mp"})
	max_page = int(mp["value"])  # the "mp" input holds the total page count

	uid = "xxxxxxxxx"  # placeholder uid
	for page in range(1, max_page + 1):
		jobs_queue.put((uid, page))
	# start one GetLatestBlog worker per crawler account
	for account in accounts:
		gsid = account[0]
		proxy = account[1]
		if proxy == "None":
			proxy = None
		glb = GetLatestBlog(jobs_queue, results_queue, gsid, proxy)
		glb.setDaemon(True)
		glb.start()

	jobs_queue.join()
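Because the GetLatestBlog workers run as daemon threads, the main thread only needs to block on jobs_queue.join(): once every (uid, page) job has been marked done the call returns and the process can exit, taking the idle workers down with it.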
Example #3
class StatusUpdater(threading.Thread):
	def __init__(self, jobs):
		threading.Thread.__init__(self)
		self.wc = WeiboCrawler()
		self.jobs = jobs

	def run(self):
		# each worker handles at most one job and then exits
		if self.jobs.empty():
			return
		gsid, proxy, content, image_info = self.jobs.get()
		self.setGsid(gsid)
		self.setProxy(proxy)

		#self.wc.sendBlog(content, image_info)
		#self.wc.setBirthday("1987", "4", "30")
		#self.wc.setAvatar("2.gif", "image/gif", "2.gif")
		time.sleep(2)
		# mark the job done only after it has been processed
		self.jobs.task_done()

	def setGsid(self, gsid):
		if gsid is None:
			return
		self.wc.setGsid(gsid)
		self.wc.is_login = True

	def setProxy(self, proxy):
		if proxy is None:
			return
		self.wc.setProxy(proxy)

	def login(self, username, password, proxy=None):
		self.wc.login(username, password, proxy)

	def isLogin(self):
		return self.wc.is_login
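The snippet never shows how StatusUpdater is driven. Below is a minimal driver sketch, not taken from the original project: it assumes each job is a (gsid, proxy, content, image_info) tuple, as unpacked in run(), and reuses the getAllGsidProxyPair() / "None"-proxy convention from the other examples; the status text is a made-up placeholder.

# Hypothetical driver for StatusUpdater, assuming the queue layout expected by run().
import Queue

jobs = Queue.Queue()
wc = WeiboCrawler()
for gsid, proxy in wc.getAllGsidProxyPair():
	if proxy == "None":  # accounts store a missing proxy as the string "None"
		proxy = None
	jobs.put((gsid, proxy, "placeholder status text", None))

# one single-shot worker per queued job
for _ in range(jobs.qsize()):
	su = StatusUpdater(jobs)
	su.setDaemon(True)
	su.start()

jobs.join()  # returns once every job has been marked done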
Example #4
				datestr = date.strftime("%Y%m%d")
				#print datestr
				#print keyword
				keywords.put((keyword.strip(), datestr))
				date = date + delta
			
			# one WeiboCrawler per account: each is used first by a KeyWord thread,
			# then reused by a Search thread once the keyword queue has been drained
			objects = []
			for account in accounts_list:
				gsid = account[0]
				if account[1] == "None":
					proxy = None
				else: 
					proxy = account[1] 
				c = WeiboCrawler()
				c.setGsid(gsid)
				c.setProxy(proxy)
				objects.append(c)
				kw = KeyWord(c, keywords, jobs)
				kw.setDaemon(True)
				kw.start()
			
			keywords.join()
			
			for crawler in objects:
				s = Search(crawler, jobs, results_queue)
				s.setDaemon(True)
				s.start()
				
			jobs.join()
			logger.info("[%s] Done!" % task)
Example #5
	# Get a Database handle
	dbh = connect['weibo']

	fans = Queue.Queue()
	atts = Queue.Queue()
	jobs = Queue.Queue()


	wc = WeiboCrawler()
	accounts = wc.getAllGsidProxyPair()
	for account in accounts:
		gsid, proxy = account[0], account[1]
		if proxy == "None":  # the accounts table stores a missing proxy as the string "None"
			proxy = None
		print gsid, proxy
		wct = WeiboCrawler(gsid=gsid, proxy=proxy)
		wct.setGsid(gsid)
		wct.setProxy(proxy)
		wsg = WeiboSocialGraph(jobs, dbh, wct)
		wsg.setDaemon(True)
		wsg.start()

		paj = PrepareAttsJobs(atts, jobs, dbh, wct)
		paj.setDaemon(True)
		paj.start()


	# poll the database for users whose attentions have not been checked yet
	while True:
		users = dbh.user.find({"checkAtts":0}, {"_id":1}, limit=10, sort=[("iTime", ASCENDING)])
#		users = dbh.user.find({"checkFans":0, "checkAtts":0, "fans":{"$lte": 5000}}, {"_id":1}, limit=10, sort=[("iTime", ASCENDING)])
		for user in users:
			print user['_id']
			logger.info("[job][%s]" % user['_id'])
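The loop is truncated at this point. A purely hypothetical continuation, assuming the selected user ids are handed to the attention-checking workers through the shared atts queue and the poll then pauses before querying again:

			atts.put(user['_id'])  # hypothetical: queue the user for PrepareAttsJobs
		time.sleep(60)  # hypothetical: back off before polling the database again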
Example #6
				js = json.loads(res)
				if int(js['ok']) == 1:
					with open("%s/data/profile/%s.info" % (basepath, datetime.date.today()), "a") as profilef:
						profilef.write("%s\n" % res)
					self.jobs_queue.task_done()
				else:
					raise Exception("Unknown Exception")
			except Exception, e:
				# log the failure and put the uid back with an incremented retry counter
				logger.info(url)
				logger.info(res)
				logger.error(e)
				number = number + 1
				self.jobs_queue.put((uid, number))
				self.jobs_queue.task_done()
				continue

if __name__ == "__main__":
	jobs_queue = Queue.Queue()
	jobs_queue.put(("1804147667", 0))
	wc = WeiboCrawler()
	accounts = wc.getAllGsidProxyPair()
	gsid, proxy = accounts[0]
	if proxy == "None":
		proxy = None
	print gsid, proxy
	wc.setGsid(gsid)
	wc.setProxy(proxy)
	gp = GetProfile(wc, jobs_queue)
	gp.setDaemon(False)  # non-daemon, so the process stays alive until the worker finishes
	gp.start()