Esempio n. 1
0
 def run(self):
     global FINISH
     global page_count
     for i in range(RETRY):
         try:
             result = get_request(
                 '%s/Forums/Statistics/?Page=%s' % (DOMAIN, self.page_num), cookie=self.cookie)
             if result['status']:
                 content = tryutf8(result['content'])
                 soup = BeautifulSoup(content)
                 status_rows = soup.find_all('div', class_='StatisticsRow')
                 for status in status_rows:
                     b = BeautifulSoup(str(status))
                     b_tag = b.find_all('p', class_='Number')
                     works = []
                     if len(b_tag) == 9:
                         works = [b.string for b in b_tag]
                         works[0] = works[0][1:-1]
                         threadLock.acquire()
                         WORKS[b_tag[0].string] = works
                         threadLock.release()
             break
         except Exception, e:
             print self.page_num, e
Esempio n. 2
0
 def run(self):
     global FINISH
     global page_count
     for i in range(SPEECH_RETRY):
         try:
             result = get_request(
                 '%s/Forums/Speeches/?SpeechType=%s&Page=%s' %
                 (DOMAIN, self.speech_type, self.page_num),
                 cookie=self.cookie)
             if result['status']:
                 content = result['content']
                 content = tryutf8(content)
                 soup = BeautifulSoup(content)
                 speech_items = soup.find_all('div', class_='Speech')
                 for speech_item in speech_items:
                     speech = {}
                     b = BeautifulSoup(str(speech_item))
                     # day and time
                     day_sign_str = '本地演讲,第'.decode('utf-8')
                     day_tag = b.find_all('p',
                                          text=re.compile(r'%s.+' %
                                                          day_sign_str))
                     if day_tag:
                         day_tag = day_tag[0]
                         result = regex_find(r'[0-9]+', day_tag.string)
                         # day
                         speech['day'] = int(result[0])
                         # hour
                         speech['hour'] = int(result[1])
                         # minute
                         speech['minute'] = int(result[2])
                     # speech id
                     speechId = b.find_all('div',
                                           class_='Speech')[0]['speechid']
                     speech['id'] = speechId
                     # author
                     speech['author'] = tryutf8(
                         b.find_all('a', class_='WithEntityCard',
                                    href=True)[1].string)
                     # content
                     speech['content'] = tryutf8(
                         b.find_all('p', class_='')[0].text)
                     # Links
                     hrefs = b.find_all('a', class_='', href=True)
                     if hrefs:
                         for href in hrefs:
                             href_text = href.string
                             # tag
                             if href_text[0] == '#':
                                 speech['tag'] = tryutf8(href_text[1:-1])
                             # link
                             elif href_text[0] == '(':
                                 if not 'links' in speech:
                                     speech['links'] = []
                                 speech['links'].append([
                                     tryutf8(href_text[1:-1]), href['href']
                                 ])
                     # like
                     like_tag = b.find_all('span',
                                           class_='Number',
                                           type='1')
                     if like_tag:
                         like_tag = like_tag[0]
                     speech['like'] = int(like_tag.string[1:-1])
                     # watch
                     watch_tag = b.find_all('span',
                                            class_='Number',
                                            type='3')
                     if watch_tag:
                         watch_tag = watch_tag[0]
                     speech['watch'] = int(watch_tag.string[1:-1])
                     # dislike
                     dislike_tag = b.find_all('span',
                                              class_='Number',
                                              type='2')
                     if dislike_tag:
                         dislike_tag = dislike_tag[0]
                     speech['dislike'] = int(dislike_tag.string[1:-1])
                     threadLock.acquire()
                     # print speechId, speech['content']
                     SPEECHES[speechId] = speech
                     threadLock.release()
             break
         except Exception, e:
             print self.page_num, e
Esempio n. 3
0
# Scrape the forum "square" page: log in, then collect the city name, the
# in-game day number and the list of districts, writing results into an
# xlwt workbook.
# NOTE(review): login/get_request/tryutf8/regex_find and EMAIL/PWD/DOMAIN
# are presumably defined earlier in the full file -- confirm.
import xlwt
from bs4 import BeautifulSoup

cookie = login(EMAIL, PWD)
district = []
if cookie is None:
    print "Login fail"
else:
    square_result = get_request(
        DOMAIN + '/Forums/', cookie=cookie)
    if square_result['status']:
        # NOTE(review): 'file' shadows the builtin -- consider renaming.
        file = xlwt.Workbook(encoding='utf-8')
        square_content = square_result['content']
        soup = BeautifulSoup(square_content)
        # city: page title is "<city>的广场..."; take the part before the marker
        city_name = tryutf8(
            soup.find_all('title')[0].string.split('的广场'.decode('utf-8'))[0])
        # day: first number in the second <p> of the Clock div
        day_number = int(regex_find(
            r'[0-9]+', BeautifulSoup(str(soup.find_all('div', class_='Clock')[0])).find_all('p')[1].string)[0])
        # district: each District div yields [numeric id, display name]
        soup_districts = soup.find_all('div', class_='District', href=True)
        for district_item in soup_districts:
            soup_district_tag = BeautifulSoup(str(district_item)).find_all(
                'a', class_=False, href=True)[0]
            district.append([int(regex_find(
                r'[0-9]+', str(soup_district_tag))[0]),  tryutf8(soup_district_tag.string)])
        for district_item in district:
            district_id = district_item[0]
            district_name = district_item[1]
            estates = []
            # NOTE(review): the example is truncated here in SOURCE -- the
            # get_request call below is cut off mid-statement.
            district_result = get_request(
Esempio n. 4
0
 def run(self):
     global FINISH
     global page_count
     for i in range(SPEECH_RETRY):
         try:
             result = get_request(
                 '%s/Forums/Speeches/?SpeechType=%s&Page=%s' % (
                     DOMAIN, self.speech_type, self.page_num),
                 cookie=self.cookie)
             if result['status']:
                 content = result['content']
                 content = tryutf8(content)
                 soup = BeautifulSoup(content)
                 speech_items = soup.find_all('div', class_='Speech')
                 for speech_item in speech_items:
                     speech = {}
                     b = BeautifulSoup(str(speech_item))
                     # day and time
                     day_sign_str = '本地演讲,第'.decode('utf-8')
                     day_tag = b.find_all(
                         'p', text=re.compile(r'%s.+' % day_sign_str))
                     if day_tag:
                         day_tag = day_tag[0]
                         result = regex_find(r'[0-9]+', day_tag.string)
                         # day
                         speech['day'] = int(result[0])
                         # hour
                         speech['hour'] = int(result[1])
                         # minute
                         speech['minute'] = int(result[2])
                     # speech id
                     speechId = b.find_all(
                         'div', class_='Speech')[0]['speechid']
                     speech['id'] = speechId
                     # author
                     speech['author'] = tryutf8(
                         b.find_all('a', class_='WithEntityCard', href=True)[1].string)
                     # content
                     speech['content'] = tryutf8(
                         b.find_all('p', class_='')[0].text)
                     # Links
                     hrefs = b.find_all('a', class_='', href=True)
                     if hrefs:
                         for href in hrefs:
                             href_text = href.string
                             # tag
                             if href_text[0] == '#':
                                 speech['tag'] = tryutf8(href_text[1:-1])
                             # link
                             elif href_text[0] == '(':
                                 if not 'links' in speech:
                                     speech['links'] = []
                                 speech['links'].append(
                                     [tryutf8(href_text[1:-1]), href['href']])
                     # like
                     like_tag = b.find_all(
                         'span', class_='Number', type='1')
                     if like_tag:
                         like_tag = like_tag[0]
                     speech['like'] = int(like_tag.string[1:-1])
                     # watch
                     watch_tag = b.find_all(
                         'span', class_='Number', type='3')
                     if watch_tag:
                         watch_tag = watch_tag[0]
                     speech['watch'] = int(watch_tag.string[1:-1])
                     # dislike
                     dislike_tag = b.find_all(
                         'span', class_='Number', type='2')
                     if dislike_tag:
                         dislike_tag = dislike_tag[0]
                     speech['dislike'] = int(dislike_tag.string[1:-1])
                     threadLock.acquire()
                     # print speechId, speech['content']
                     SPEECHES[speechId] = speech
                     threadLock.release()
             break
         except Exception, e:
             print self.page_num, e
Esempio n. 5
0
# Scrape the forum "square" page: log in, then collect the city name, the
# in-game day number and the list of districts into an xlwt workbook.
# NOTE(review): login/get_request/tryutf8/regex_find and EMAIL/PWD/DOMAIN
# are presumably defined earlier in the full file -- confirm. The script
# continues beyond the end of this excerpt.
import os
import xlwt
from bs4 import BeautifulSoup

cookie = login(EMAIL, PWD)
district = []
if cookie is None:
    print "Login fail"
else:
    square_result = get_request(DOMAIN + '/Forums/', cookie=cookie)
    if square_result['status']:
        # NOTE(review): 'file' shadows the builtin -- consider renaming.
        file = xlwt.Workbook(encoding='utf-8')
        square_content = square_result['content']
        soup = BeautifulSoup(square_content)
        # city: page title is "<city>的广场..."; take the part before the marker
        city_name = tryutf8(
            soup.find_all('title')[0].string.split('的广场'.decode('utf-8'))[0])
        # day: first number in the second <p> of the Clock div
        day_number = int(
            regex_find(
                r'[0-9]+',
                BeautifulSoup(str(soup.find_all(
                    'div', class_='Clock')[0])).find_all('p')[1].string)[0])
        # district: each District div yields [numeric id, display name]
        soup_districts = soup.find_all('div', class_='District', href=True)
        for district_item in soup_districts:
            soup_district_tag = BeautifulSoup(str(district_item)).find_all(
                'a', class_=False, href=True)[0]
            district.append([
                int(regex_find(r'[0-9]+', str(soup_district_tag))[0]),
                tryutf8(soup_district_tag.string)
            ])