'''Module-level imports these scripts rely on (assumed from the calls below;
BeautifulSoup is bs4, given the find_all/class_ usage).'''
import os
import sys
import threading
from time import sleep
from Queue import Queue
import MySQLdb
from DBUtils.PooledDB import PooledDB
from bs4 import BeautifulSoup

def main():
    '''Log in to Weibo.'''
    paramDict = read_config()
    if not login(paramDict['username'], paramDict['password']):
        exit()
    '''Open the database connection pool and a cursor.'''
    pool = PooledDB(MySQLdb, int(paramDict['threadnum']), host = paramDict['dbhost'],
                    user = paramDict['dbuser'], passwd = paramDict['dbpasswd'], db = paramDict['dbname'])
    conn = pool.connection()
    cur = conn.cursor()
    '''Read the uncrawled links and put them into a queue.'''
    urlQLock = threading.Lock()
    tableName = 'users'
    sql = 'select id, uid from %s where isCrawled = 0' % tableName
    cur.execute(sql)
    result = cur.fetchall()
    urlQ = Queue(len(result))
    for entry in result:
        urlQ.put(entry)
    '''Create the worker threads.'''
    threadPool = []
    for i in xrange(int(paramDict['threadnum'])):
        thr = DownloadThread(pool, urlQ, urlQLock)
        threadPool.append(thr)
        thr.start()
    '''Watch for finished threads and replace each one with a fresh thread.'''
    while True:
        try:
            sleep(60)
            '''Break out of the loop once the queue is empty.'''
            if not urlQ.qsize():
                break
            if threading.activeCount() < int(paramDict['threadnum']) + 1:
                '''Find the threads that have finished and clear them out.
                Iterate over a copy of the list so removing entries does not
                skip or misindex the remaining threads.'''
                for thr in threadPool[:]:
                    if not thr.isAlive():
                        thr.clear()
                        threadPool.remove(thr)
                        newThr = DownloadThread(pool, urlQ, urlQLock)
                        threadPool.append(newThr)
                        newThr.start()
        except:
            print sys.exc_info()[0]
            for thr in threadPool:
                thr.end()
            break
    print 'Main thread end!'
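#A minimal sketch of the DownloadThread worker that main() drives above. The
#real class lives elsewhere in this repo, so the constructor signature is taken
#from the call site, but the body, the download_user() call, and the users-table
#update are assumptions. It drains (id, uid) entries from the shared queue and
#honors the clear()/end() hooks the supervisor loop expects.
class DownloadThread(threading.Thread):
    def __init__(self, pool, urlQ, urlQLock):
        threading.Thread.__init__(self)
        self.pool = pool
        self.urlQ = urlQ
        self.urlQLock = urlQLock
        self.stopped = False

    def run(self):
        conn = self.pool.connection()
        cur = conn.cursor()
        while not self.stopped:
            #Pop the next entry under the lock; stop when the queue runs dry.
            self.urlQLock.acquire()
            if not self.urlQ.qsize():
                self.urlQLock.release()
                break
            entry = self.urlQ.get()
            self.urlQLock.release()
            download_user(entry[1])
            cur.execute('update users set isCrawled = 1 where id = %d' % entry[0])
            conn.commit()
        cur.close()
        conn.close()

    def clear(self):
        #Hook for releasing per-thread state before the object is dropped.
        pass

    def end(self):
        #Ask the thread to stop after the item it is working on.
        self.stopped = True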
def main():
    #Login Weibo.
    paramDict = read_config()
    if not login(paramDict['username'], paramDict['password']):
        exit()
    #Crawl the report board.
    '''pageTotal = 13653
    pageStart = 13651
    for pageNum in xrange(pageStart, pageTotal+1):
        url = 'http://service.account.weibo.com/index?type=0&status=4&page=%d' % pageNum
        page = download(url)
        dstFile = open(os.path.join('Data', str(pageNum)), 'w')
        dstFile.write(page)
        dstFile.close()
        print pageNum'''
    #Crawl the reports' mainpages.
    #Connect to the database.
    pool = PooledDB(MySQLdb, 1, host = "localhost", user = "******", passwd = "123456", db = "abnormal")
    conn = pool.connection()
    cur = conn.cursor()
    #Read the list of URLs which haven't been crawled.
    sql = 'select id, url from reportlinks where isCrawled=0'
    cur.execute(sql)
    for entry in cur.fetchall():
        #Download the page and write it to a local file.
        #entry[1] is '/show?rid=...'; [10:] strips the prefix to leave the rid.
        url = 'http://service.account.weibo.com%s' % entry[1]
        page = download(url)
        dstFile = open(os.path.join('Report', '%d_%s' % (entry[0], entry[1][10:])), 'w')
        dstFile.write(page)
        dstFile.close()
        #Update the URL's isCrawled flag.
        sql = 'update reportlinks set isCrawled=1 where id=%d' % entry[0]
        cur.execute(sql)
        conn.commit()
        print entry[0], entry[1][10:]
    cur.close()
    conn.close()
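#A minimal sketch of the download() helper every main() above calls; the real
#implementation ships with this repo, so treat the body (and the retry-free
#error handling) as an assumption. It issues a GET through the cookie-aware
#opener that login() is assumed to install globally, with an optional headers
#dict for the Referer tricks used below.
import urllib2

def download(url, headers=None):
    request = urllib2.Request(url, headers=headers or {})
    response = urllib2.urlopen(request)
    page = response.read()
    response.close()
    return page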
def main():
    #Login Weibo.
    paramDict = read_config()
    if not login(paramDict['username'], paramDict['password']):
        exit()
    '''rid = 'K1CaJ6Q5d6ake'
    url = 'http://widget.weibo.com/distribution/comments.php?width=0&url=http://service.account.weibo.com/show?rid=%s&ralateuid=3097939193&appkey=689653874' % rid
    #url = 'http://service.account.weibo.com%s' % entry[1]
    headers = {'Referer' : 'http://service.account.weibo.com/show?rid=%s' % rid, }
    page = download(url, headers)
    objFolder = 'Report_Detail'
    dstFile = open(os.path.join(objFolder, rid, 'Comments', '1'), 'w')
    dstFile.write(page)
    dstFile.close()
    soup = BeautifulSoup(page)
    scriptTag = soup.find_all('script')[1]
    sinceId = patternSinceId.search(scriptTag.string).group(1)
    shortUrl = patternShortUrl.search(scriptTag.string).group(1)
    commentCount = int(patternNumber.search(soup.find(class_='list_title').string).group())
    pageCount = commentCount / 10
    if commentCount % 10 != 0:
        pageCount += 1
    print commentCount, sinceId, shortUrl
    for i in xrange(2, pageCount+1):
        url = 'http://widget.weibo.com/distribution/aj_getcomments.php?since_id=%s&adminFlag=&appkey=689653874&short_url=%s&language=zh_cn&_t=0' % (sinceId, shortUrl)
        headers = {'Referer' : 'http://widget.weibo.com/distribution/comments.php?width=0&url=http://service.account.weibo.com/show?rid=%s&ralateuid=3097939193&appkey=689653874' % rid, }
        page = download(url, headers)
        dstFile = open(os.path.join(objFolder, rid, 'Comments', str(i)), 'w')
        dstFile.write(page)
        dstFile.close()
        contentDict = eval(page)
        sinceId = contentDict['since_id']
        print i
        #soup = BeautifulSoup(page)'''
    '''f = open('Report_Detail/K1CaJ6Q5d6ake/Comments/2')
    content = f.read()
    f.close()
    contentDict = eval(content)
    out = open('Report_Detail/K1CaJ6Q5d6ake/Comments/tmp', 'w')
    out.write(contentDict['html'].replace('\/', '/'))
    out.close()
    print contentDict['since_id']'''
    #Crawl the reports' mainpages.
    #Connect to the database.
    pool = PooledDB(MySQLdb, 1, host = "localhost", user = "******", passwd = "123456", db = "abnormal")
    conn = pool.connection()
    cur = conn.cursor()
    objFolder = 'Report_Detail'
    #Read the list of URLs which haven't been crawled.
    sql = 'select id, url from reportlinks where isCrawled=0'
    cur.execute(sql)
    for entry in cur.fetchall():
        #Download the first comments page and write it to a local file.
        #entry[1] is '/show?rid=...'; [10:] strips the prefix to leave the rid.
        rid = entry[1][10:]
        url = 'http://widget.weibo.com/distribution/comments.php?width=0&url=http://service.account.weibo.com/show?rid=%s&ralateuid=3097939193&appkey=689653874' % rid
        headers = {'Referer' : 'http://service.account.weibo.com/show?rid=%s' % rid, }
        page = download(url, headers)
        #Make sure the per-report Comments folder exists before writing.
        commentDir = os.path.join(objFolder, rid, 'Comments')
        if not os.path.exists(commentDir):
            os.makedirs(commentDir)
        dstFile = open(os.path.join(commentDir, '1'), 'w')
        dstFile.write(page)
        dstFile.close()
        #Parse the paging parameters out of the widget's inline script.
        soup = BeautifulSoup(page)
        scriptTag = soup.find_all('script')[1]
        sinceId = patternSinceId.search(scriptTag.string).group(1)
        shortUrl = patternShortUrl.search(scriptTag.string).group(1)
        commentCount = int(patternNumber.search(soup.find(class_='list_title').string).group())
        pageCount = commentCount / 10
        if commentCount % 10 != 0:
            pageCount += 1
        #print commentCount, sinceId, shortUrl
        for i in xrange(2, pageCount+1):
            url = 'http://widget.weibo.com/distribution/aj_getcomments.php?since_id=%s&adminFlag=&appkey=689653874&short_url=%s&language=zh_cn&_t=0' % (sinceId, shortUrl)
            headers = {'Referer' : 'http://widget.weibo.com/distribution/comments.php?width=0&url=http://service.account.weibo.com/show?rid=%s&ralateuid=3097939193&appkey=689653874' % rid, }
            page = download(url, headers)
            dstFile = open(os.path.join(commentDir, str(i)), 'w')
            dstFile.write(page)
            dstFile.close()
            #The response is a JSON-style dict; json.loads(page) would be safer than eval().
            contentDict = eval(page)
            sinceId = contentDict['since_id']
            print i
        #Update the URL's isCrawled flag.
        sql = 'update reportlinks set isCrawled=1 where id=%d' % entry[0]
        cur.execute(sql)
        conn.commit()
        print entry[0], entry[1][10:], commentCount
    cur.close()
    conn.close()
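#The three module-level regexes referenced above are not shown in this file.
#These are plausible shapes only: the since_id/short_url variable names inside
#the widget's inline script are guesses, so check them against a saved page
#before trusting the captures. patternNumber just pulls the first integer out
#of the list_title text.
import re

patternSinceId = re.compile(r"since_id['\"]?\s*[:=]\s*['\"]([^'\"]+)['\"]")
patternShortUrl = re.compile(r"short_url['\"]?\s*[:=]\s*['\"]([^'\"]+)['\"]")
patternNumber = re.compile(r'\d+')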
def main():
    from WeiboLogin import login
    paramDict = read_config()
    if not login(paramDict['username'], paramDict['password']):
        exit()
    download_user('1850235592')
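#read_config() is shared by every main() above. This is one plausible shape,
#assuming a plain key=value text file named config.ini in the working directory
#(the file name and format are assumptions, not repo facts); the mains above
#expect at least username, password, threadnum, dbhost, dbuser, dbpasswd and
#dbname keys.
def read_config(path='config.ini'):
    paramDict = {}
    f = open(path)
    for line in f:
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        key, value = line.split('=', 1)
        paramDict[key.strip()] = value.strip()
    f.close()
    return paramDict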