# -*- coding: utf-8 -*-
import os
import threading

from bs4 import BeautifulSoup

# print_processbar is assumed to live in the same helper module as the other
# utilities; this scraper's original import block is not part of the excerpt.
from mccblackteck import DOMAIN, get_request, login, print_processbar, tryutf8
from account import EMAIL, PWD

RETRY = 3         # retries per page; value assumed, not in the excerpt
THREAD_SIZE = 10  # live-thread threshold for the wait loop; value assumed
WORKS = {}        # shared results: statistic name -> row of nine values
FINISH = 0        # pages finished so far, guarded by threadLock
page_count = 0
threadLock = threading.Lock()


class getWorks(threading.Thread):
    # Worker thread: scrape one page of /Forums/Statistics/. The class header
    # and __init__ are reconstructed from how the main block below builds and
    # starts these threads.

    def __init__(self, page_num, cookie):
        threading.Thread.__init__(self)
        self.page_num = page_num
        self.cookie = cookie

    def run(self):
        global FINISH
        global page_count
        for i in range(RETRY):
            try:
                result = get_request(
                    '%s/Forums/Statistics/?Page=%s' % (DOMAIN, self.page_num),
                    cookie=self.cookie)
                if result['status']:
                    content = tryutf8(result['content'])
                    soup = BeautifulSoup(content)
                    status_rows = soup.find_all('div', class_='StatisticsRow')
                    for status in status_rows:
                        b = BeautifulSoup(str(status))
                        b_tag = b.find_all('p', class_='Number')
                        if len(b_tag) == 9:
                            works = [tag.string for tag in b_tag]
                            # strip the decoration around the first value
                            works[0] = works[0][1:-1]
                            threadLock.acquire()
                            WORKS[b_tag[0].string] = works
                            threadLock.release()
                    break
            except Exception as e:
                print self.page_num, e
        # page done (or retries exhausted): update the shared progress count
        threadLock.acquire()
        FINISH += 1
        print_processbar(page_count, FINISH)
        threadLock.release()


cookie = login(EMAIL, PWD)
if cookie is None:
    print "Login fail"
else:
    if not os.path.exists('works'):
        os.makedirs('works')
    # get pageCount from the pagination links on the first page
    result = get_request("%s/Forums/Statistics/" % DOMAIN, cookie=cookie)
    if result['status']:
        content = result['content']
        pagination = BeautifulSoup(content).find_all('div', class_='Pagination')
        if pagination:
            # the second-to-last pager link holds the last page number
            page_count = int(
                BeautifulSoup(str(pagination[0])).find_all(
                    'a', href=True)[-2].string)
        threads = []
        for i in range(1, page_count + 1):
            threads.append(getWorks(i, cookie))
        for thread in threads:
            thread.start()
        # wait until fewer than THREAD_SIZE threads remain alive
        while True:
            if len(threading.enumerate()) < THREAD_SIZE:
                break
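# The scrapers in this repo all lean on a small helper module, mccblackteck,
# whose source is not part of this excerpt. Below is a minimal sketch of the
# interface they appear to assume; the signatures and return shapes are
# inferred from the call sites above and below, not taken from the real
# module.

import re
import urllib2

DOMAIN = 'http://civitas.soobb.com'  # base URL, as seen in the estate scraper


def get_request(url, cookie=None):
    # Fetch url, optionally sending a session cookie, and return a dict with
    # a 'status' success flag and the raw response body under 'content'.
    try:
        request = urllib2.Request(url)
        if cookie:
            request.add_header('Cookie', cookie)
        return {'status': True, 'content': urllib2.urlopen(request).read()}
    except urllib2.URLError:
        return {'status': False, 'content': ''}


def regex_find(pattern, text):
    # Return every non-overlapping match of pattern in text.
    return re.findall(pattern, text)


def tryutf8(text):
    # Best-effort conversion of a unicode object to a UTF-8 byte string.
    if isinstance(text, unicode):
        return text.encode('utf-8')
    return text


def login(email, pwd):
    # Authenticate against the site and return a session cookie string, or
    # None on failure; the real endpoint and payload are not in this excerpt.
    raise NotImplementedError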
# -*- coding: utf-8 -*-
import re
import urllib2

from mccblackteck import get_request, login, regex_find, try_get_value, write_file
from account import EMAIL, PWD

cookie = login(EMAIL, PWD)
if cookie is None:
    print "Login fail"
else:
    # CSV header: ID;city;district;terrain;block;status;type;name;manager;
    # owner;product;product level;construction;planning;footprint;jobs;
    # residents;capacity;employed;upkeep;development
    write_file(
        'estate.txt', 'w+',
        'ID;城市;城区;地形;街区;状态;类型;名称;管理;拥有;产品;产品等级;建设;规划;占地;工作;居住;容纳;已工作;维护;开发')
    for i in range(1, 50000):
        result = get_request(
            'http://civitas.soobb.com/Estates/' + str(i) + '/Details/',
            cookie=cookie)
        if result['status']:
            content = result['content']
            estateName = estateType = estateStatus = estateLocation = \
                estatePosition = estateManage = estateOwner = \
                estateDistricts = estateProduct = estateProductLevel = \
                estateCity = ''
            # three separate dicts: the original chained `a = b = c = {}`
            # would alias a single object across all three names
            estateDevelop, estateArea, estatePeople = {}, {}, {}
            # Check ID: skip the page if the avatar link's estate id does
            # not match the requested i
            idList = regex_find(
                r'<div class=\"Avatar AvatarMedium\"><a href=\"/Estates/[0-9]+/Details/\" class=\"Normal\">',
                content)
            if len(idList) == 1:
                estateId = re.compile(
                    r'(<div class=\"Avatar AvatarMedium\"><a href=\"/Estates/)([0-9]+)(/Details/\" class=\"Normal\">)'
                ).sub(r'\2', idList[0])
                if str(estateId) != str(i):
                    continue
            # Estate name
            nameList = regex_find(
                r'<a href="/Estates/[0-9]+/Details/">.+</a>', content)
            if len(nameList) == 1:
class getSpeech(threading.Thread):
    # Worker thread: scrape one page of /Forums/Speeches/. The class header
    # and __init__ are reconstructed from the main block below; speech_type
    # is assumed to be seeded from the module-level SPEECH_TYPE.

    def __init__(self, page_num, cookie):
        threading.Thread.__init__(self)
        self.page_num = page_num
        self.cookie = cookie
        self.speech_type = SPEECH_TYPE

    def run(self):
        global FINISH
        global page_count
        for i in range(SPEECH_RETRY):
            try:
                result = get_request(
                    '%s/Forums/Speeches/?SpeechType=%s&Page=%s' % (
                        DOMAIN, self.speech_type, self.page_num),
                    cookie=self.cookie)
                if result['status']:
                    content = tryutf8(result['content'])
                    soup = BeautifulSoup(content)
                    speech_items = soup.find_all('div', class_='Speech')
                    for speech_item in speech_items:
                        speech = {}
                        b = BeautifulSoup(str(speech_item))
                        # day and time: three numbers following the
                        # "本地演讲,第" marker
                        day_sign_str = '本地演讲,第'.decode('utf-8')
                        day_tag = b.find_all(
                            'p', text=re.compile(r'%s.+' % day_sign_str))
                        if day_tag:
                            day_tag = day_tag[0]
                            result = regex_find(r'[0-9]+', day_tag.string)
                            speech['day'] = int(result[0])
                            speech['hour'] = int(result[1])
                            speech['minute'] = int(result[2])
                        # speech id
                        speechId = b.find_all(
                            'div', class_='Speech')[0]['speechid']
                        speech['id'] = speechId
                        # author
                        speech['author'] = tryutf8(
                            b.find_all('a', class_='WithEntityCard',
                                       href=True)[1].string)
                        # content
                        speech['content'] = tryutf8(
                            b.find_all('p', class_='')[0].text)
                        # links: "#tag#" anchors and "(title)" hyperlinks
                        hrefs = b.find_all('a', class_='', href=True)
                        if hrefs:
                            for href in hrefs:
                                href_text = href.string
                                if href_text[0] == '#':
                                    speech['tag'] = tryutf8(href_text[1:-1])
                                elif href_text[0] == '(':
                                    if 'links' not in speech:
                                        speech['links'] = []
                                    speech['links'].append(
                                        [tryutf8(href_text[1:-1]),
                                         href['href']])
                        # like / watch / dislike counters, each wrapped in
                        # one decoration character on either side
                        like_tag = b.find_all(
                            'span', class_='Number', type='1')
                        if like_tag:
                            speech['like'] = int(like_tag[0].string[1:-1])
                        watch_tag = b.find_all(
                            'span', class_='Number', type='3')
                        if watch_tag:
                            speech['watch'] = int(watch_tag[0].string[1:-1])
                        dislike_tag = b.find_all(
                            'span', class_='Number', type='2')
                        if dislike_tag:
                            speech['dislike'] = int(
                                dislike_tag[0].string[1:-1])
                        threadLock.acquire()
                        # print speechId, speech['content']
                        SPEECHES[speechId] = speech
                        threadLock.release()
                    break
            except Exception as e:
                print self.page_num, e
        threadLock.acquire()
        FINISH += 1
        print_processbar(page_count, FINISH)
        threadLock.release()


cookie = login(EMAIL, PWD)
if cookie is None:
    print "Login fail"
else:
    if not os.path.exists('speech'):
        os.makedirs('speech')
    # get pageCount
    result = get_request(
        "%s/Forums/Speeches/?SpeechType=%s" % (DOMAIN, SPEECH_TYPE),
        cookie=cookie)
    if result['status']:
        content = result['content']
        pagination = BeautifulSoup(content).find_all('div', class_='Pagination')
        if pagination:
            page_count = int(
                BeautifulSoup(str(pagination[0])).find_all(
                    'a', href=True)[-2].string)
        threads = []
        for i in range(1, page_count + 1):
            threads.append(getSpeech(i, cookie))
        for thread in threads:
            thread.start()
        while True:
            if len(threading.enumerate()) < THREAD_SIZE:
                break
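# Both main blocks above wait for their workers by polling
# threading.enumerate() until fewer than THREAD_SIZE threads remain, which
# can move on while the last few pages are still being scraped. The usual
# alternative is a join-based wait; a hypothetical replacement for the
# polling loop, not the original code:

def wait_for_workers(threads):
    # Block until every started worker thread has finished its page.
    for thread in threads:
        thread.join()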
# -*- coding: utf-8 -*-
import datetime
import os
import re

import xlwt
from bs4 import BeautifulSoup

from mccblackteck import tryutf8, get_request, login, regex_find, DOMAIN
from account import EMAIL, PWD

cookie = login(EMAIL, PWD)
district = []
if cookie is None:
    print "Login fail"
else:
    square_result = get_request(DOMAIN + '/Forums/', cookie=cookie)
    if square_result['status']:
        file = xlwt.Workbook(encoding='utf-8')
        square_content = square_result['content']
        soup = BeautifulSoup(square_content)
        # city
        city_name = tryutf8(
            soup.find_all('title')[0].string.split('的广场'.decode('utf-8'))[0])
        # day
        day_number = int(regex_find(
            r'[0-9]+',
            BeautifulSoup(str(soup.find_all('div', class_='Clock')[0]))
            .find_all('p')[1].string)[0])
        # district
        soup_districts = soup.find_all('div', class_='District', href=True)
        for district_item in soup_districts:
            soup_district_tag = BeautifulSoup(str(district_item)).find_all(
                'a', class_=False, href=True)[0]
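# The square scraper fragment above ends right after creating its
# xlwt.Workbook. For reference, a self-contained sketch of how such an
# export is typically finished with xlwt; the sheet name, cell layout, and
# output path here are illustrative, not from the original script.

import xlwt

book = xlwt.Workbook(encoding='utf-8')
sheet = book.add_sheet('districts')   # one worksheet for the export
sheet.write(0, 0, 'city')             # header row
sheet.write(0, 1, 'day')
sheet.write(1, 0, 'ExampleCity')      # a placeholder data row
sheet.write(1, 1, 42)
book.save('square.xls')               # xlwt writes legacy .xls workbooks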