def run(self):
    global FINISH
    global page_count
    # Fetch one statistics page, retrying up to RETRY times.
    for i in range(RETRY):
        try:
            result = get_request(
                '%s/Forums/Statistics/?Page=%s' % (DOMAIN, self.page_num),
                cookie=self.cookie)
            if result['status']:
                content = tryutf8(result['content'])
                soup = BeautifulSoup(content)
                status_rows = soup.find_all('div', class_='StatisticsRow')
                for status in status_rows:
                    b = BeautifulSoup(str(status))
                    b_tag = b.find_all('p', class_='Number')
                    # A complete row carries exactly 9 numeric cells.
                    if len(b_tag) == 9:
                        # Use a distinct loop name so the comprehension does
                        # not shadow the row soup `b` (Python 2 leaks the
                        # comprehension variable into the enclosing scope).
                        works = [tag.string for tag in b_tag]
                        # Strip the wrapping characters around the first cell.
                        works[0] = works[0][1:-1]
                        threadLock.acquire()
                        WORKS[b_tag[0].string] = works
                        threadLock.release()
                break
        except Exception, e:
            print self.page_num, e
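# The run() above belongs to a threading.Thread subclass that is not shown
# in this excerpt. A minimal sketch of the assumed wrapper follows; the
# class name StatisticsPageThread and this constructor signature are
# assumptions, not the original code. DOMAIN, RETRY, WORKS, threadLock,
# get_request() and tryutf8() are expected at module level.
import threading

from bs4 import BeautifulSoup


class StatisticsPageThread(threading.Thread):
    """Hypothetical wrapper: scrapes one statistics page per worker."""

    def __init__(self, page_num, cookie):
        threading.Thread.__init__(self)
        self.page_num = page_num  # 1-based page index of /Forums/Statistics/
        self.cookie = cookie      # session cookie returned by login()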
def run(self):
    global FINISH
    global page_count
    # Fetch one page of speeches, retrying up to SPEECH_RETRY times.
    for i in range(SPEECH_RETRY):
        try:
            result = get_request(
                '%s/Forums/Speeches/?SpeechType=%s&Page=%s' % (
                    DOMAIN, self.speech_type, self.page_num),
                cookie=self.cookie)
            if result['status']:
                content = tryutf8(result['content'])
                soup = BeautifulSoup(content)
                speech_items = soup.find_all('div', class_='Speech')
                for speech_item in speech_items:
                    speech = {}
                    b = BeautifulSoup(str(speech_item))
                    # Day and time: the timestamp line starts with
                    # '本地演讲,第' ("local speech, day N").
                    day_sign_str = '本地演讲,第'.decode('utf-8')
                    day_tag = b.find_all(
                        'p', text=re.compile(r'%s.+' % day_sign_str))
                    if day_tag:
                        day_tag = day_tag[0]
                        # Use a distinct name so the request `result`
                        # is not shadowed.
                        numbers = regex_find(r'[0-9]+', day_tag.string)
                        speech['day'] = int(numbers[0])
                        speech['hour'] = int(numbers[1])
                        speech['minute'] = int(numbers[2])
                    # Speech id
                    speechId = b.find_all(
                        'div', class_='Speech')[0]['speechid']
                    speech['id'] = speechId
                    # Author
                    speech['author'] = tryutf8(
                        b.find_all('a', class_='WithEntityCard',
                                   href=True)[1].string)
                    # Content
                    speech['content'] = tryutf8(
                        b.find_all('p', class_='')[0].text)
                    # Links: '#...' anchors are tags, '(...)' anchors
                    # are external links.
                    hrefs = b.find_all('a', class_='', href=True)
                    for href in hrefs:
                        href_text = href.string
                        if href_text[0] == '#':
                            speech['tag'] = tryutf8(href_text[1:-1])
                        elif href_text[0] == '(':
                            if 'links' not in speech:
                                speech['links'] = []
                            speech['links'].append(
                                [tryutf8(href_text[1:-1]), href['href']])
                    # Reaction counters: type 1 = like, 2 = dislike,
                    # 3 = watch; the count is wrapped in one character
                    # on each side, hence the [1:-1] slice.
                    like_tag = b.find_all('span', class_='Number', type='1')
                    if like_tag:
                        speech['like'] = int(like_tag[0].string[1:-1])
                    watch_tag = b.find_all('span', class_='Number', type='3')
                    if watch_tag:
                        speech['watch'] = int(watch_tag[0].string[1:-1])
                    dislike_tag = b.find_all('span', class_='Number', type='2')
                    if dislike_tag:
                        speech['dislike'] = int(dislike_tag[0].string[1:-1])
                    threadLock.acquire()
                    SPEECHES[speechId] = speech
                    threadLock.release()
                break
        except Exception, e:
            print self.page_num, e
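# Neither the worker bootstrap nor the shared state is shown in this excerpt.
# A minimal sketch of how the speech workers are presumably driven, assuming
# a SpeechPageThread(speech_type, page_num, cookie) wrapper analogous to the
# statistics one; crawl_speeches() and the page_count bound are assumptions.
import threading

threadLock = threading.Lock()  # guards the shared result dicts
SPEECHES = {}                  # speech id -> parsed speech dict


def crawl_speeches(speech_type, page_count, cookie):
    workers = [SpeechPageThread(speech_type, page, cookie)
               for page in range(1, page_count + 1)]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
    return SPEECHES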
import xlwt
from bs4 import BeautifulSoup

cookie = login(EMAIL, PWD)
district = []
if cookie is None:
    print "Login failed"
else:
    square_result = get_request(DOMAIN + '/Forums/', cookie=cookie)
    if square_result['status']:
        # Workbook that will hold the exported sheets.
        file = xlwt.Workbook(encoding='utf-8')
        square_content = square_result['content']
        soup = BeautifulSoup(square_content)
        # City: the page title reads '<city>的广场' ("<city>'s square").
        city_name = tryutf8(
            soup.find_all('title')[0].string.split(
                '的广场'.decode('utf-8'))[0])
        # Day: the in-game day number shown in the clock widget.
        day_number = int(regex_find(
            r'[0-9]+',
            BeautifulSoup(str(soup.find_all(
                'div', class_='Clock')[0])).find_all('p')[1].string)[0])
        # Districts: collect [id, name] pairs from the district links.
        soup_districts = soup.find_all('div', class_='District', href=True)
        for district_item in soup_districts:
            soup_district_tag = BeautifulSoup(str(district_item)).find_all(
                'a', class_=False, href=True)[0]
            district.append([
                int(regex_find(r'[0-9]+', str(soup_district_tag))[0]),
                tryutf8(soup_district_tag.string)
            ])
        # Walk each district and fetch its estates.
        for district_item in district:
            district_id, district_name = district_item
            estates = []
            district_result = get_request(
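# tryutf8() and regex_find() are called throughout but never defined in this
# excerpt. Plausible Python 2 implementations, sketched here as assumptions
# about their behavior rather than the original helpers:
import re


def tryutf8(text):
    # Encode unicode to a UTF-8 byte string; pass non-unicode through as-is.
    try:
        return text.encode('utf-8')
    except (AttributeError, UnicodeError):
        return text


def regex_find(pattern, text):
    # Return every non-overlapping match of pattern in text, in order.
    return re.findall(pattern, text)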