print "Login fail" else: write_file( 'estate.txt', 'w+', 'ID;城市;城区;地形;街区;状态;类型;名称;管理;拥有;产品;产品等级;建设;规划;占地;工作;居住;容纳;已工作;维护;开发') for i in range(1, 50000): result = get_request('http://civitas.soobb.com/Estates/' + str(i) + '/Details/', cookie=cookie) if result['status']: content = result['content'] estateName = estateType = estateStatus = estateLocation = estatePosition = estateManage = estateOwner = estateDistricts = estateProduct = estateProductLevel = estateCity = '' estateDevelop = estateArea = estatePeople = {} # Check ID idList = regex_find( r'<div class=\"Avatar AvatarMedium\"><a href=\"/Estates/[0-9]+/Details/\" class=\"Normal\">', content) if len(idList) == 1: id = re.compile( r'(<div class=\"Avatar AvatarMedium\"><a href=\"/Estates/)([0-9]+)(/Details/\" class=\"Normal\">)' ).sub(r'\2', idList[0]) if str(id) != str(i): continue # Estate name nameList = regex_find(r'<a href="/Estates/[0-9]+/Details/">.+</a>', content) if len(nameList) == 1: estateName = re.compile( r'<a href="/Estates/[0-9]+/Details/">(.+)</a>').sub( r'\1', nameList[0]) if estateName == '':
# Log in, then request the details page of every estate ID in 1..49999 and
# pull individual fields out of the returned HTML with regular expressions.
# NOTE(review): the original line structure was lost; the nesting below is
# reconstructed from the statements themselves -- confirm against the file.
cookie = login(EMAIL, PWD)
if cookie is None:
    print "Login fail"
else:
    # Semicolon-separated column names, handed to write_file with mode 'w+'
    # (presumably this (re)creates estate.txt with a header row -- verify
    # against write_file).
    write_file(
        'estate.txt', 'w+',
        'ID;城市;城区;地形;街区;状态;类型;名称;管理;拥有;产品;产品等级;建设;规划;占地;工作;居住;容纳;已工作;维护;开发')
    for i in range(1, 50000):
        result = get_request(
            'http://civitas.soobb.com/Estates/' + str(i) + '/Details/',
            cookie=cookie)
        if result['status']:
            content = result['content']
            # Reset every per-estate text field to the empty string.
            estateName = estateType = estateStatus = estateLocation = estatePosition = estateManage = estateOwner = estateDistricts = estateProduct = estateProductLevel = estateCity = ''
            # NOTE(review): the chained assignment binds all three names to
            # the SAME dict object; mutating one mutates the others.
            estateDevelop = estateArea = estatePeople = {}
            # Check ID: when exactly one avatar link is found, extract the
            # estate number from it and skip this page if it is not the ID
            # that was requested.
            idList = regex_find(
                r'<div class=\"Avatar AvatarMedium\"><a href=\"/Estates/[0-9]+/Details/\" class=\"Normal\">',
                content)
            if len(idList) == 1:
                # NOTE(review): `id` shadows the builtin of the same name.
                id = re.compile(
                    r'(<div class=\"Avatar AvatarMedium\"><a href=\"/Estates/)([0-9]+)(/Details/\" class=\"Normal\">)').sub(r'\2', idList[0])
                if str(id) != str(i):
                    continue
            # Estate name: taken from the link text when exactly one
            # matching link is found.
            nameList = regex_find(
                r'<a href="/Estates/[0-9]+/Details/">.+</a>', content)
            if len(nameList) == 1:
                estateName = re.compile(
                    r'<a href="/Estates/[0-9]+/Details/">(.+)</a>').sub(r'\1', nameList[0])
            # No name found: record the fixed status string below instead.
            if estateName == '':
                estateStatus = '消失'
            # Estate type and status: the parenthesised <span> that closes
            # the <h3> heading.
            typeList = regex_find(r'</a><span>\(.+\)</span></h3>', content)
def run(self):
    """Fetch one page of speeches and record every speech found on it.

    Requests ``/Forums/Speeches/`` on ``DOMAIN`` for ``self.speech_type``
    and ``self.page_num`` using ``self.cookie``, making at most
    ``SPEECH_RETRY`` attempts. Each ``<div class="Speech">`` on the page is
    turned into a dict and stored in the module-level ``SPEECHES`` mapping
    under its ``speechid`` attribute, guarded by ``threadLock``.

    Keys written on each speech dict:
        day, hour, minute -- first three numbers of the day/time paragraph
                             (only when that paragraph is present)
        id                -- the ``speechid`` attribute
        author, content   -- passed through ``tryutf8``
        tag               -- text of a link starting with ``#`` (optional)
        links             -- list of ``[text, href]`` for links starting
                             with ``(`` (optional)
        like, watch, dislike -- integer counters (each optional)

    Any exception raised during an attempt is printed together with the
    page number and the attempt is retried; nothing is raised to the caller.
    """
    # Loop-invariant: the matcher for the day/time paragraph is the same for
    # every speech, so build it once instead of once per speech.
    day_sign_str = '本地演讲,第'.decode('utf-8')
    day_pattern = re.compile(r'%s.+' % day_sign_str)
    # (key on the speech dict, value of the counter span's ``type`` attribute)
    counters = (('like', '1'), ('watch', '3'), ('dislike', '2'))

    for _ in range(SPEECH_RETRY):
        try:
            response = get_request(
                '%s/Forums/Speeches/?SpeechType=%s&Page=%s' % (
                    DOMAIN, self.speech_type, self.page_num),
                cookie=self.cookie)
            if not response['status']:
                # Unsuccessful request: use up one attempt and try again.
                continue

            soup = BeautifulSoup(tryutf8(response['content']))
            for speech_item in soup.find_all('div', class_='Speech'):
                speech = {}
                b = BeautifulSoup(str(speech_item))

                # day and time
                day_tags = b.find_all('p', text=day_pattern)
                if day_tags:
                    # Kept in its own name so the HTTP response above is
                    # not overwritten, as the original code did.
                    numbers = regex_find(r'[0-9]+', day_tags[0].string)
                    speech['day'] = int(numbers[0])
                    speech['hour'] = int(numbers[1])
                    speech['minute'] = int(numbers[2])

                # speech id
                speechId = b.find_all('div', class_='Speech')[0]['speechid']
                speech['id'] = speechId

                # author
                speech['author'] = tryutf8(
                    b.find_all('a', class_='WithEntityCard',
                               href=True)[1].string)

                # content
                speech['content'] = tryutf8(
                    b.find_all('p', class_='')[0].text)

                # links
                for href in b.find_all('a', class_='', href=True):
                    href_text = href.string
                    if not href_text:
                        # ``.string`` is None when the link has no single
                        # text child (and may be empty); indexing it would
                        # raise and fail the whole page on every retry.
                        continue
                    if href_text[0] == '#':
                        # tag: drop the first and last character
                        speech['tag'] = tryutf8(href_text[1:-1])
                    elif href_text[0] == '(':
                        # link: drop the first and last character
                        speech.setdefault('links', []).append(
                            [tryutf8(href_text[1:-1]), href['href']])

                # like / watch / dislike counters; the number is wrapped in
                # one leading and one trailing character that are dropped.
                for key, type_value in counters:
                    number_tags = b.find_all(
                        'span', class_='Number', type=type_value)
                    if number_tags:
                        speech[key] = int(number_tags[0].string[1:-1])

                # Always release the lock, even if the store raises.
                threadLock.acquire()
                try:
                    SPEECHES[speechId] = speech
                finally:
                    threadLock.release()

            # NOTE(review): the original indentation was lost; ``break`` is
            # placed so a successfully processed page ends the retry loop --
            # confirm against the original file.
            break
        except Exception as e:
            # Same output as ``print self.page_num, e``.
            print('%s %s' % (self.page_num, e))
def run(self):
    """Fetch one page of speeches and record every speech found on it.

    Requests ``/Forums/Speeches/`` on ``DOMAIN`` for ``self.speech_type``
    and ``self.page_num`` using ``self.cookie``, making at most
    ``SPEECH_RETRY`` attempts. Each ``<div class="Speech">`` on the page is
    turned into a dict and stored in the module-level ``SPEECHES`` mapping
    under its ``speechid`` attribute, guarded by ``threadLock``.

    Keys written on each speech dict:
        day, hour, minute -- first three numbers of the day/time paragraph
                             (only when that paragraph is present)
        id                -- the ``speechid`` attribute
        author, content   -- passed through ``tryutf8``
        tag               -- text of a link starting with ``#`` (optional)
        links             -- list of ``[text, href]`` for links starting
                             with ``(`` (optional)
        like, watch, dislike -- integer counters (each optional)

    Any exception raised during an attempt is printed together with the
    page number and the attempt is retried; nothing is raised to the caller.
    """
    # Loop-invariant: the matcher for the day/time paragraph is the same for
    # every speech, so build it once instead of once per speech.
    day_sign_str = '本地演讲,第'.decode('utf-8')
    day_pattern = re.compile(r'%s.+' % day_sign_str)
    # (key on the speech dict, value of the counter span's ``type`` attribute)
    counters = (('like', '1'), ('watch', '3'), ('dislike', '2'))

    for _ in range(SPEECH_RETRY):
        try:
            response = get_request(
                '%s/Forums/Speeches/?SpeechType=%s&Page=%s' % (
                    DOMAIN, self.speech_type, self.page_num),
                cookie=self.cookie)
            if not response['status']:
                # Unsuccessful request: use up one attempt and try again.
                continue

            soup = BeautifulSoup(tryutf8(response['content']))
            for speech_item in soup.find_all('div', class_='Speech'):
                speech = {}
                b = BeautifulSoup(str(speech_item))

                # day and time
                day_tags = b.find_all('p', text=day_pattern)
                if day_tags:
                    # Kept in its own name so the HTTP response above is
                    # not overwritten, as the original code did.
                    numbers = regex_find(r'[0-9]+', day_tags[0].string)
                    speech['day'] = int(numbers[0])
                    speech['hour'] = int(numbers[1])
                    speech['minute'] = int(numbers[2])

                # speech id
                speechId = b.find_all('div', class_='Speech')[0]['speechid']
                speech['id'] = speechId

                # author
                speech['author'] = tryutf8(
                    b.find_all('a', class_='WithEntityCard',
                               href=True)[1].string)

                # content
                speech['content'] = tryutf8(
                    b.find_all('p', class_='')[0].text)

                # links
                for href in b.find_all('a', class_='', href=True):
                    href_text = href.string
                    if not href_text:
                        # ``.string`` is None when the link has no single
                        # text child (and may be empty); indexing it would
                        # raise and fail the whole page on every retry.
                        continue
                    if href_text[0] == '#':
                        # tag: drop the first and last character
                        speech['tag'] = tryutf8(href_text[1:-1])
                    elif href_text[0] == '(':
                        # link: drop the first and last character
                        speech.setdefault('links', []).append(
                            [tryutf8(href_text[1:-1]), href['href']])

                # like / watch / dislike counters; the number is wrapped in
                # one leading and one trailing character that are dropped.
                for key, type_value in counters:
                    number_tags = b.find_all(
                        'span', class_='Number', type=type_value)
                    if number_tags:
                        speech[key] = int(number_tags[0].string[1:-1])

                # Always release the lock, even if the store raises.
                threadLock.acquire()
                try:
                    SPEECHES[speechId] = speech
                finally:
                    threadLock.release()

            # NOTE(review): the original indentation was lost; ``break`` is
            # placed so a successfully processed page ends the retry loop --
            # confirm against the original file.
            break
        except Exception as e:
            # Same output as ``print self.page_num, e``.
            print('%s %s' % (self.page_num, e))
# Log in, read the city name, the current day number and the list of
# districts from the /Forums/ page, then request each district's estate list.
# NOTE(review): the original line structure was lost; the nesting below is
# reconstructed from the statements themselves -- confirm against the file.
cookie = login(EMAIL, PWD)
# Filled below with [district_id, district_name] pairs.
district = []
if cookie is None:
    print "Login fail"
else:
    square_result = get_request(DOMAIN + '/Forums/', cookie=cookie)
    if square_result['status']:
        # NOTE(review): `file` shadows the Python 2 builtin of the same name.
        file = xlwt.Workbook(encoding='utf-8')
        square_content = square_result['content']
        soup = BeautifulSoup(square_content)
        # city: the part of the page <title> before the fixed suffix
        city_name = tryutf8(
            soup.find_all('title')[0].string.split('的广场'.decode('utf-8'))[0])
        # day: first number in the second <p> of the first "Clock" div
        day_number = int(regex_find(
            r'[0-9]+',
            BeautifulSoup(str(soup.find_all('div', class_='Clock')[0])).find_all('p')[1].string)[0])
        # district: for every "District" div, take its first class-less link;
        # the first number in that link's markup is used as the district id
        # and the link text as the district name.
        soup_districts = soup.find_all('div', class_='District', href=True)
        for district_item in soup_districts:
            soup_district_tag = BeautifulSoup(str(district_item)).find_all(
                'a', class_=False, href=True)[0]
            district.append([int(regex_find(
                r'[0-9]+', str(soup_district_tag))[0]), tryutf8(soup_district_tag.string)])
        # Request the estate listing page of each collected district.
        for district_item in district:
            district_id = district_item[0]
            district_name = district_item[1]
            estates = []
            district_result = get_request(
                DOMAIN + '/Districts/' + str(district_id) + '/Estates/', cookie=cookie)
            if district_result['status']:
                district_content = district_result['content']
district = [] if cookie is None: print "Login fail" else: square_result = get_request(DOMAIN + '/Forums/', cookie=cookie) if square_result['status']: file = xlwt.Workbook(encoding='utf-8') square_content = square_result['content'] soup = BeautifulSoup(square_content) # city city_name = tryutf8( soup.find_all('title')[0].string.split('的广场'.decode('utf-8'))[0]) # day day_number = int( regex_find( r'[0-9]+', BeautifulSoup(str(soup.find_all( 'div', class_='Clock')[0])).find_all('p')[1].string)[0]) # district soup_districts = soup.find_all('div', class_='District', href=True) for district_item in soup_districts: soup_district_tag = BeautifulSoup(str(district_item)).find_all( 'a', class_=False, href=True)[0] district.append([ int(regex_find(r'[0-9]+', str(soup_district_tag))[0]), tryutf8(soup_district_tag.string) ]) for district_item in district: district_id = district_item[0] district_name = district_item[1] estates = [] district_result = get_request(DOMAIN + '/Districts/' +