Example #1
def main():
	#Log in to Weibo.
	paramDict = read_config()
	if not login(paramDict['username'], paramDict['password']):
		exit()

	#Connect to the database and create a cursor.
	pool = PooledDB(MySQLdb, int(paramDict['threadnum']),  host = paramDict['dbhost'], user = paramDict['dbuser'], passwd = paramDict['dbpasswd'], db = paramDict['dbname'])
	conn = pool.connection()
	cur = conn.cursor()

	#Read the uncrawled link list into the queue.
	urlQLock = threading.Lock()
	tableName = 'users'
	sql = 'select id, uid from %s where isCrawled = 0' % tableName
	cur.execute(sql)
	result = cur.fetchall()
	urlQ = Queue(len(result))
	for entry in result:
		urlQ.put(entry)

	#Create the worker threads.
	threadPool = []
	for i in xrange(int(paramDict['threadnum'])):
		thr = DownloadThread(pool, urlQ, urlQLock)
		threadPool.append(thr)
		thr.start()
	 
	#If any thread has finished, replace it with a fresh one.
	while True:
		try:
			sleep(60)
			#Break out of the loop once the queue is empty.
			if not urlQ.qsize():
				break
			if threading.activeCount() < int(paramDict['threadnum']) + 1:
				#Find the finished threads, remove them, and start replacements.
				#Iterate over a copy: removing from and appending to threadPool
				#while looping over it directly would skip entries.
				for thr in threadPool[:]:
					if not thr.isAlive():
						thr.clear()
						threadPool.remove(thr)
						newThr = DownloadThread(pool, urlQ, urlQLock)
						newThr.start()
						threadPool.append(newThr)
		except:
			print sys.exc_info()[0]
			for thr in threadPool:
				thr.end()
			break
	print 'Main thread end!'
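
The example above assumes a DownloadThread class (with clear() and end() methods) defined elsewhere in the module. A minimal sketch of what such a worker might look like; the class body, the stop-event mechanics, and the reuse of download_user() from Example #4 are assumptions, not the project's actual code:

import threading

class DownloadThread(threading.Thread):
	'''Hypothetical worker: pops (id, uid) rows off the shared queue,
	crawls each user, and marks the row as crawled.'''
	def __init__(self, pool, urlQ, urlQLock):
		threading.Thread.__init__(self)
		self.pool = pool
		self.urlQ = urlQ
		self.urlQLock = urlQLock
		self.stopFlag = threading.Event()

	def run(self):
		conn = self.pool.connection()
		cur = conn.cursor()
		while not self.stopFlag.is_set():
			self.urlQLock.acquire()
			try:
				if self.urlQ.empty():
					break
				rowId, uid = self.urlQ.get()
			finally:
				self.urlQLock.release()
			download_user(uid)	#assumed to be the helper called in Example #4
			cur.execute('update users set isCrawled=1 where id=%s', (rowId,))
			conn.commit()
		cur.close()
		conn.close()

	def clear(self):
		pass	#hook main() calls before discarding a dead thread

	def end(self):
		self.stopFlag.set()	#ask run() to exit at the next iteration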
Example #2
def main():
	#Log in to Weibo.
	paramDict = read_config()
	if not login(paramDict['username'], paramDict['password']):
		exit()

	#Crawl the report board. 
	'''pageTotal = 13653
	pageStart = 13651
	for pageNum in xrange(pageStart, pageTotal+1):
		url = 'http://service.account.weibo.com/index?type=0&status=4&page=%d' % pageNum
		page = download(url)
		dstFile = open(os.path.join('Data', str(pageNum)), 'w')
		dstFile.write(page)
		dstFile.close()
		print pageNum'''

	#Crawl the reports' main pages.
	#Connect to the database.
	pool = PooledDB(MySQLdb, 1,  host = "localhost", user = "******", passwd = "123456", db = "abnormal")
	conn = pool.connection()
	cur = conn.cursor()

	#Read the list of URLs which haven't been crawled. 
	sql = 'select id, url from reportlinks where isCrawled=0'
	cur.execute(sql)
	for entry in cur.fetchall():
		#Download the page and write it to a local file.
		url = 'http://service.account.weibo.com%s' % entry[1]
		page = download(url)
		dstFile = open(os.path.join('Report', '%d_%s' % (entry[0], entry[1][10:])), 'w')
		dstFile.write(page)
		dstFile.close()
		#Update the URL's isCrawled flag. 
		cur.execute('update reportlinks set isCrawled=1 where id=%s', (entry[0],))
		conn.commit()
		print entry[0], entry[1][10:]
	cur.close()
	conn.close()
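
Both this example and the next call a download() helper that is not shown. A minimal sketch, assuming login() installs a cookie-aware urllib2 opener so a plain urlopen() carries the session; the optional headers parameter matches how Example #3 calls it, but the body is a guess:

import urllib2

def download(url, headers=None):
	'''Hypothetical fetch helper: GET a URL with optional extra headers
	and return the response body as a string.'''
	request = urllib2.Request(url, headers=headers or {})
	request.add_header('User-Agent', 'Mozilla/5.0')	#assumed; the real crawler may send different headers
	response = urllib2.urlopen(request)
	try:
		return response.read()
	finally:
		response.close()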
Example #3
def main():
	#Log in to Weibo.
	paramDict = read_config()
	if not login(paramDict['username'], paramDict['password']):
		exit()
	'''rid = 'K1CaJ6Q5d6ake'
	url = 'http://widget.weibo.com/distribution/comments.php?width=0&url=http://service.account.weibo.com/show?rid=%s&ralateuid=3097939193&appkey=689653874' % rid
	#url = 'http://service.account.weibo.com%s' % entry[1]
	headers = {'Referer' : 'http://service.account.weibo.com/show?rid=%s' % rid, }
	page = download(url, headers)
	objFolder = 'Report_Detail'
	dstFile = open(os.path.join(objFolder, rid, 'Comments', '1'), 'w')
	dstFile.write(page)
	dstFile.close()
	soup = BeautifulSoup(page)
	scriptTag = soup.find_all('script')[1]
	sinceId = patternSinceId.search(scriptTag.string).group(1)
	shortUrl = patternShortUrl.search(scriptTag.string).group(1)
	commentCount = int(patternNumber.search(soup.find(class_='list_title').string).group())
	pageCount = commentCount / 10
	if commentCount % 10 != 0:
		pageCount += 1
	print commentCount, sinceId, shortUrl
	for i in xrange(2, pageCount+1):
		url = 'http://widget.weibo.com/distribution/aj_getcomments.php?since_id=%s&adminFlag=&appkey=689653874&short_url=%s&language=zh_cn&_t=0' % (sinceId, shortUrl)
		headers = {'Referer' : 'http://widget.weibo.com/distribution/comments.php?width=0&url=http://service.account.weibo.com/show?rid=%s&ralateuid=3097939193&appkey=689653874' % rid, }
		page = download(url, headers)
		dstFile = open(os.path.join(objFolder, rid, 'Comments', str(i)), 'w')
		dstFile.write(page)
		dstFile.close()
		contentDict = eval(page)
		sinceId = contentDict['since_id']
		print i
		#soup = BeautifulSoup(page)'''
	'''f = open('Report_Detail/K1CaJ6Q5d6ake/Comments/2')
	content = f.read()
	f.close()
	contentDict = eval(content)
	out = open('Report_Detail/K1CaJ6Q5d6ake/Comments/tmp', 'w')
	out.write(contentDict['html'].replace('\/', '/'))
	out.close()
	print contentDict['since_id']'''
	
	#Crawl each report's comment pages.
	#Connect to the database.
	pool = PooledDB(MySQLdb, 1,  host = "localhost", user = "******", passwd = "123456", db = "abnormal")
	conn = pool.connection()
	cur = conn.cursor()
	objFolder = 'Report_Detail'
	#Read the list of URLs which haven't been crawled. 
	sql = 'select id, url from reportlinks where isCrawled=0'
	cur.execute(sql)
	for entry in cur.fetchall():
		#Download the page and write it to a local file.
		rid = entry[1][10:]
		url = 'http://widget.weibo.com/distribution/comments.php?width=0&url=http://service.account.weibo.com/show?rid=%s&ralateuid=3097939193&appkey=689653874' % rid
		#url = 'http://service.account.weibo.com%s' % entry[1]
		headers = {'Referer' : 'http://service.account.weibo.com/show?rid=%s' % rid, }
		page = download(url, headers)
		dstFile = open(os.path.join(objFolder, rid, 'Comments', '1'), 'w')
		dstFile.write(page)
		dstFile.close()
		soup = BeautifulSoup(page)
		scriptTag = soup.find_all('script')[1]
		sinceId = patternSinceId.search(scriptTag.string).group(1)
		shortUrl = patternShortUrl.search(scriptTag.string).group(1)
		commentCount = int(patternNumber.search(soup.find(class_='list_title').string).group())
		pageCount = commentCount / 10
		if commentCount % 10 != 0:
			pageCount += 1
		#print commentCount, sinceId, shortUrl
		for i in xrange(2, pageCount+1):
			url = 'http://widget.weibo.com/distribution/aj_getcomments.php?since_id=%s&adminFlag=&appkey=689653874&short_url=%s&language=zh_cn&_t=0' % (sinceId, shortUrl)
			headers = {'Referer' : 'http://widget.weibo.com/distribution/comments.php?width=0&url=http://service.account.weibo.com/show?rid=%s&ralateuid=3097939193&appkey=689653874' % rid, }
			page = download(url, headers)
			dstFile = open(os.path.join(objFolder, rid, 'Comments', str(i)), 'w')
			dstFile.write(page)
			dstFile.close()
			contentDict = eval(page)	#response is a dict literal; json.loads would be safer if it is strict JSON
			sinceId = contentDict['since_id']
			print i
		#Update the URL's isCrawled flag. 
		cur.execute('update reportlinks set isCrawled=1 where id=%s', (entry[0],))
		conn.commit()
		print entry[0], entry[1][10:], commentCount
	cur.close()
	conn.close()
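
Example #3 also depends on three module-level regexes (patternSinceId, patternShortUrl, patternNumber) defined outside the snippet. Plausible definitions, reconstructed only from how the match groups are consumed above; the real patterns may differ:

import re

#since_id and short_url are scraped from the page's second <script> block;
#both patterns below are guesses at the embedded JavaScript assignments.
patternSinceId = re.compile(r"since_id['\"]?\s*[:=]\s*['\"]([^'\"]+)")
patternShortUrl = re.compile(r"short_url['\"]?\s*[:=]\s*['\"]([^'\"]+)")
#The comment total is taken as the first run of digits in the list_title text.
patternNumber = re.compile(r'\d+')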
Example #4
def main():
	from WeiboLogin import login
	paramDict = read_config()
	if not login(paramDict['username'], paramDict['password']):
		exit()
	download_user('1850235592')
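
Every example starts with read_config(). A minimal sketch that parses a flat key=value file; the file name, its format, and the comment convention are assumptions based only on the keys main() looks up (username, password, dbhost, threadnum, and so on):

def read_config(path='config.txt'):
	'''Hypothetical config reader: one key=value pair per line,
	returning the dict that main() indexes.'''
	paramDict = {}
	f = open(path)
	try:
		for line in f:
			line = line.strip()
			if not line or line.startswith('#'):
				continue
			key, value = line.split('=', 1)
			paramDict[key.strip()] = value.strip()
	finally:
		f.close()
	return paramDict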