Example no. 1
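A worker thread that pulls (uid, page) jobs off a queue, fetches that page of the user's microblogs through WeiboCrawler, and appends each parsed entry to a per-day data file. It assumes roughly the following imports; WeiboCrawler and basepath come from the surrounding project:

import threading
import time
import random
import re
import datetime
from BeautifulSoup import BeautifulSoup  # bs4's BeautifulSoup also works here
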
class GetLatestBlog(threading.Thread):
	def __init__(self, jobs_queue, results_queue, gsid, proxy=None):
		threading.Thread.__init__(self)
		self.jobs_queue = jobs_queue
		self.results_queue = results_queue
		self.gsid = gsid
		self.proxy = proxy
		self.wc = WeiboCrawler()
		self.wc.setGsid(self.gsid)
		self.wc.setProxy(self.proxy)

	def run(self):
		while True:
			time.sleep(random.randint(2, 4))

			uid, page = self.jobs_queue.get()
			self.jobs_queue.task_done()

			if page is None:
				page = "1"
			resp = self.wc.getMicroBlogs(uid, page)
			if resp is None:
				self.jobs_queue.put((uid, page))  # requeue the full job tuple; a bare uid would break the unpack above
				continue  # nothing to parse for a failed fetch
			soup = BeautifulSoup(resp)
			body = soup.body
			mblogs = body.findAll("div", {"class": "c", "id": re.compile(u"M_")})
			if not mblogs: # findAll returns an empty list, never None, when the page has no microblogs
				continue
			blogs_file = open("%s/data/blogs/%s.blog" % (basepath, datetime.date.today()), "a")
			for mblog in mblogs:
				blogs_file.write("[%s]:%s\n" % (uid, mblog))
			blogs_file.close()
Example no. 2
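Keyword search against t.hexun.com: builds the escaped query URL, decodes the GB2312 response, and scrapes the total hit count plus each message's avatar, nickname, text, and timestamp (the .select() calls imply bs4).
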
 def getWeibos(self, keyword, page=1, count=None):
     # json.dumps escapes a non-ASCII keyword to \uXXXX; swapping '\' for '%'
     # yields the %uXXXX form the hexun search URL expects, and the quotes are dropped
     url = 'http://t.hexun.com/k/topic.html?type=1&value=%s&pg=%d' % (json.dumps(keyword).replace('\\', '%').replace('"', ''), page)
     result = WeiboCrawler.request(self, url, self.headers)
     if 'result' in result and result['result']:
         infos = result['info'].decode('gb2312')
         soup = BeautifulSoup(infos)
         total_soup = soup.select('.headerR1')[0]
         total_num = total_soup.get_text().split(u'共')[-1].split(u'条')[0].strip()
         return_val = {'total_count': int(total_num), 'msgs':[]}
         allmsgs = []
         msgs_soup = soup.select('.nr_con')
         for msg_soup in msgs_soup:
             avatar = 'http://t.hexun.com%s' % msg_soup.select('.nr_conLa > a')[0].get('href')
             # split only on the first ':' so colons inside the message body survive
             nickandtext = msg_soup.select('.nr_shuo')[0].get_text().split(':', 1)
             nickname = nickandtext[0]
             text = nickandtext[1]
             ts = msg_soup.select('.nr_tan > h3 > a')[0].get_text()
             allmsgs.append({
                 'avatar': avatar,
                 'nickname': nickname,
                 'text': text,
                 'datetime': ts,
                 })
         return_val['msgs'] = allmsgs
         return return_val
Example no. 3
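Driver for the GetLatestBlog worker from Example no. 1: probes one profile to read the page count from the pager widget, enqueues one (uid, page) job per page, and starts a worker per gsid/proxy account before joining the queue.
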
def main():
	results_queue = Queue.Queue()
	jobs_queue = Queue.Queue()

	wc = WeiboCrawler()
	accounts = wc.getAllGsidProxyPair()

	gsid, proxy = accounts[0][0], accounts[0][1]
	if proxy == "None":
		proxy = None
	wc.setGsid(gsid)
	wc.setProxy(proxy)

	res = wc.getMicroBlogs("1646194541")
	soup = BeautifulSoup(res)
	pagelist = soup.find("div", {"id": "pagelist"})
	mp = pagelist.find("input", {"name": "mp"})
	total_pages = int(mp["value"])  # the page count sits in the input's value attribute; int(mp) on the tag itself would fail

	uid = "xxxxxxxxx"
	for page in range(1, total_pages + 1):
		jobs_queue.put((uid, page))
	for account in accounts:
		gsid = account[0]
		proxy = account[1]
		if proxy == "None":
			proxy = None
		glb = GetLatestBlog(jobs_queue, results_queue, gsid, proxy)
		glb.setDaemon(True)
		glb.start()

	jobs_queue.join()
Example no. 4
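Keyword-search helpers for two other microblog front ends: the QQ 3G touch portal (which needs the sid that getSid extracts from the JSONP login response) and m.weibo.cn's JSON search API.
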
 def getWeibos(self, keyword, page=1, size=10, sid=None):
     if not sid:
         sid = self.sid
     url = 'http://ti.3g.qq.com/touch/s?sid=%s&aid=vaction&more=1&mst=33&ac=60&keyword=%s&dl2=1&dumpJSON=1&pageid=search&pid=%d&psize=%d' % (sid, keyword, page, size)
     result = WeiboCrawler.request(self, url, self.headers)
     if 'result' in result and result['result']:
         json_info = json.loads(result['info'])
         if 'result' in json_info and json_info['result'] == '0':
             msgs = json_info['jsonDump']['msgs']
             total_info = json_info['info']
             return {'msgs': msgs, 'total_pages': total_info['pageCount'], 'total_count': total_info['totalCount']}
 def getSid(self):
     url = 'http://pt.3g.qq.com/login?act=json&format=2&bid_code=microblogLogin&r=%f&qq=%s&pmd5=%s&go_url=http://ti.3g.qq.com/touch/iphone/index.jsp?g_f=18106' % (random.random(), self.username, self.password)
     result = WeiboCrawler.request(self, url, self.headers)
     if 'result' in result and result['result']:
         info = result['info'].replace('pt.handleLoginResult(', '')[:-2]  # strip the pt.handleLoginResult(...) JSONP wrapper
         json_info = json.loads(info)
         if len(json_info) == 8:  # an 8-field reply is treated as a successful login
             sid = json_info[4]   # the fifth field carries the sid
             self.sid = sid
             return sid
     return None
 def getWeibos(self, keyword, page=1, count=10):
     url = 'http://m.weibo.cn/searchs/weibo?key=%s&page=%d&count=%d' % (keyword, page, count)
     result = WeiboCrawler.request(self, url, self.headers)
     if 'result' in result and result['result']:
         infos = result['info']
         json_infos = json.loads(infos)
         if 'ok' in json_infos and json_infos['ok']:
             return_val = {'total_count': json_infos['total_number'], 'total_pages': json_infos['maxPage'], 'msgs': []}
             msgs = json_infos['mblogList']
             return_val['msgs'] = msgs
             return return_val
Example no. 11
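A one-shot worker thread that takes a (gsid, proxy, content, image_info) job and prepares an authenticated WeiboCrawler for it; the actual posting and profile calls are left commented out.
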
class StatusUpdater(threading.Thread):
	def __init__(self, jobs):
		threading.Thread.__init__(self)
		self.wc = WeiboCrawler()
		self.jobs = jobs

	def run(self):
		if self.jobs.empty():
			return
		gsid, proxy, content, image_info = self.jobs.get()
		self.jobs.task_done()
		self.setGsid(gsid)
		self.setProxy(proxy)

		#self.wc.sendBlog(content, image_info)
		#self.wc.setBirthday("1987", "4", "30")
		#self.wc.setAvatar("2.gif", "image/gif", "2.gif")
		time.sleep(2)
		

	def setGsid(self, gsid):
		if gsid is None:
			return
		self.wc.setGsid(gsid)
		self.wc.is_login = True

	def setProxy(self, proxy):
		if proxy is None:
			return
		self.wc.setProxy(proxy)

	def login(self, username, password, proxy=None):
		self.wc.login(username, password, proxy)

	def isLogin(self):
		return self.wc.is_login
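
A minimal usage sketch for StatusUpdater, assuming the same Python 2 imports (Queue, threading) as the other examples; the content string is a placeholder:

wc = WeiboCrawler()
gsid, proxy = wc.getAllGsidProxyPair()[0]
jobs = Queue.Queue()
jobs.put((gsid, proxy, "hello", None))  # image_info is None for a text-only job
updater = StatusUpdater(jobs)
updater.setDaemon(True)
updater.start()
jobs.join()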
Example no. 12
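A truncated fragment: the tail of a search worker that appends each user_doc to baby_search_result.dat, followed by a driver that reads keyword|year|month|day|days tasks from a file and sets up the queues for each task.
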
#						users.append(user_doc)

					with open("baby_search_result.dat", "a") as sr:
						sr.write("%s\n" % user_doc)
				break
	
			self.jobs.task_done()


if __name__ == "__main__":
	results_queue = Queue.Queue()
#	pj = PersistJobs(results_queue)
#	pj.setDaemon(True)
#	pj.start()

	crawler = WeiboCrawler()
	accounts_list = crawler.getAllGsidProxyPair()

	with open(sys.argv[1].strip()) as tasks:
		for task in tasks:
			if len(task.strip()) == 0:
				continue
			items = task.split("|")
			keyword = items[0].strip()
			year = int(items[1].strip())
			month = int(items[2].strip())
			day = int(items[3].strip())
			days = int(items[4].strip())
			keywords = Queue.Queue()
			jobs = Queue.Queue()
			start = datetime.date(year, month, day)
Example no. 14
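Connects to MongoDB with the old pymongo Connection API, then starts a WeiboSocialGraph worker and a PrepareAttsJobs feeder for every gsid/proxy account, all sharing the same queues.
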
def main():
	try:
		connect = Connection(host="localhost", port=27017)
		print "Connected Successfully"
	except ConnectionFailure, e:
		sys.stderr.write("Could not connect to MongoDB: %s" % e)
		sys.exit(1)
	# Get a Database handle
	dbh = connect['weibo']

	fans = Queue.Queue()
	atts = Queue.Queue()
	jobs = Queue.Queue()


	wc = WeiboCrawler()
	accounts = wc.getAllGsidProxyPair()
	for account in accounts:
		gsid, proxy = account[0], account[1]
		print gsid, proxy
		wct = WeiboCrawler(gsid=gsid, proxy=proxy)
		wct.setGsid(gsid)
		wct.setProxy(proxy)
		wsg = WeiboSocialGraph(jobs, dbh, wct)
		wsg.setDaemon(True)
		wsg.start()

		paj = PrepareAttsJobs(atts, jobs, dbh, wct)
		paj.setDaemon(True)
		paj.start()

	jobs.join()  # keep the main thread alive so the daemon workers can run, as in the other examples
Example no. 16
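ProfileFiller wraps WeiboCrawler's profile setters and fills a whole profile from one nested kwds mapping (see the sketch after the class).
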
class ProfileFiller:
	def __init__(self, username, password, proxy=None):
		self.wc = WeiboCrawler()
		self.wc.login(username, password, proxy)
		self.is_login = self.wc.is_login  # mirror the crawler's login state

	def isLogin(self):
		return self.is_login

	def fillProfile(self, kwds):
		nick = kwds['nick']
		self.wc.setNick(nick)

		domain = kwds['domain']
		self.wc.setDomain(domain)

		description = kwds['description']
		self.wc.setDescription(description)

		tag = kwds['tag']
		self.wc.setTag(tag)

		gender = kwds['gender']
		self.wc.setGender(gender)

		provid, cityid = kwds['location']['provid'], kwds['location']['cityid']
		self.wc.setLocation(provid, cityid)

		schoolid, inyear, department = kwds['school']['id'], kwds['school']['in'], kwds['school']['department']
		self.wc.setSchool(schoolid, inyear, department)

		companyname, inyear, outyear, department = kwds['company']['name'], kwds['company']['in'], kwds['company']['out'], kwds['company']['department']
		self.wc.setCompany(companyname, inyear, outyear, department)

		year, month, day = kwds['birth']['year'], kwds['birth']['month'], kwds['birth']['day']
		self.wc.setBirthday(year, month, day)
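
A minimal sketch of the nested kwds mapping fillProfile reads, with hypothetical placeholder values throughout:

profile = {
	'nick': 'some_nick', 'domain': 'some_domain',
	'description': 'hello', 'tag': 'music', 'gender': 'm',
	'location': {'provid': '11', 'cityid': '1'},
	'school': {'id': '1001', 'in': '2005', 'department': 'CS'},
	'company': {'name': 'ACME', 'in': '2009', 'out': '2012', 'department': 'R&D'},
	'birth': {'year': '1987', 'month': '4', 'day': '30'},
}
pf = ProfileFiller('username', 'password')
if pf.isLogin():
	pf.fillProfile(profile)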