Beispiel #1
0
    print "Login fail"
else:
    write_file(
        'estate.txt', 'w+',
        'ID;城市;城区;地形;街区;状态;类型;名称;管理;拥有;产品;产品等级;建设;规划;占地;工作;居住;容纳;已工作;维护;开发')
    for i in range(1, 50000):
        result = get_request('http://civitas.soobb.com/Estates/' + str(i) +
                             '/Details/',
                             cookie=cookie)
        if result['status']:
            content = result['content']
            estateName = estateType = estateStatus = estateLocation = estatePosition = estateManage = estateOwner = estateDistricts = estateProduct = estateProductLevel = estateCity = ''
            estateDevelop = estateArea = estatePeople = {}
            # Check ID
            idList = regex_find(
                r'<div class=\"Avatar AvatarMedium\"><a href=\"/Estates/[0-9]+/Details/\" class=\"Normal\">',
                content)
            if len(idList) == 1:
                id = re.compile(
                    r'(<div class=\"Avatar AvatarMedium\"><a href=\"/Estates/)([0-9]+)(/Details/\" class=\"Normal\">)'
                ).sub(r'\2', idList[0])
                if str(id) != str(i):
                    continue
            # Estate name
            nameList = regex_find(r'<a href="/Estates/[0-9]+/Details/">.+</a>',
                                  content)
            if len(nameList) == 1:
                estateName = re.compile(
                    r'<a href="/Estates/[0-9]+/Details/">(.+)</a>').sub(
                        r'\1', nameList[0])
            if estateName == '':
Beispiel #2
0
# Crawl estate detail pages one numeric ID at a time and pull individual
# fields out of the returned HTML with regular expressions.
# Nothing is fetched unless login() hands back a cookie.
cookie = login(EMAIL, PWD)
if cookie is None:
    print "Login fail"
else:
    # Pass the semicolon-separated column header to write_file for
    # 'estate.txt' (presumably 'w+' is the open mode, which would truncate
    # the file -- confirm against write_file).
    write_file('estate.txt', 'w+',
               'ID;城市;城区;地形;街区;状态;类型;名称;管理;拥有;产品;产品等级;建设;规划;占地;工作;居住;容纳;已工作;维护;开发')
    # Probe estate IDs 1 through 49999 (range() excludes the upper bound).
    for i in range(1, 50000):
        result = get_request(
            'http://civitas.soobb.com/Estates/' + str(i) + '/Details/', cookie=cookie)
        # Only responses flagged with a truthy 'status' are parsed.
        if result['status']:
            content = result['content']
            # Reset every text field to the empty string for this estate.
            estateName = estateType = estateStatus = estateLocation = estatePosition = estateManage = estateOwner = estateDistricts = estateProduct = estateProductLevel = estateCity = ''
            # NOTE(review): chained assignment binds all three names to the
            # SAME dict object; an in-place update through one name would be
            # visible through the others -- confirm that is intended.
            estateDevelop = estateArea = estatePeople = {}
            # Check ID
            # The avatar link embeds the estate ID of the page actually
            # served.  Only an unambiguous (exactly one) match is compared;
            # pages with zero or several matches are not rejected here.
            idList = regex_find(
                r'<div class=\"Avatar AvatarMedium\"><a href=\"/Estates/[0-9]+/Details/\" class=\"Normal\">', content)
            if len(idList) == 1:
                # Keep only capture group 2, the digits of the ID.
                # NOTE(review): `id` shadows the builtin of the same name.
                id = re.compile(
                    r'(<div class=\"Avatar AvatarMedium\"><a href=\"/Estates/)([0-9]+)(/Details/\" class=\"Normal\">)').sub(r'\2', idList[0])
                # The served page belongs to a different estate than the one
                # requested: skip this ID.
                if str(id) != str(i):
                    continue
            # Estate name
            # Taken from the detail link text, again only when exactly one
            # link matches; otherwise estateName stays ''.
            nameList = regex_find(
                r'<a href="/Estates/[0-9]+/Details/">.+</a>', content)
            if len(nameList) == 1:
                estateName = re.compile(
                    r'<a href="/Estates/[0-9]+/Details/">(.+)</a>').sub(r'\1', nameList[0])
            # No name could be extracted: record that in the status field
            # using the marker string assigned below.
            if estateName == '':
                estateStatus = '消失'
            # Estate type and status
            # Matches the parenthesised <span> that closes the <h3> heading.
            typeList = regex_find(r'</a><span>\(.+\)</span></h3>', content)
Beispiel #3
0
 def run(self):
     """Fetch one speech-listing page and record every speech found on it.

     Requests ``DOMAIN/Forums/Speeches/`` for ``self.speech_type`` and
     ``self.page_num`` using ``self.cookie``.  Each ``div.Speech`` element
     of the response is parsed into a dict and stored in the module-level
     ``SPEECHES`` mapping under its ``speechid`` attribute while holding
     ``threadLock``.

     Keys written per speech:
       * ``id``, ``author``, ``content`` -- always;
       * ``day``, ``hour``, ``minute`` -- when the day paragraph is found;
       * ``tag``, ``links`` -- when matching anchors exist;
       * ``like``, ``watch``, ``dislike`` -- when the counter span exists.

     The request is attempted up to ``SPEECH_RETRY`` times, but only an
     exception triggers another attempt: a response whose ``status`` is
     falsy ends the loop without a retry.  Exceptions are printed together
     with the page number and otherwise swallowed.
     """
     # Declared but not assigned anywhere in this method; kept so the
     # method's relationship to these module globals is unchanged.
     global FINISH
     global page_count
     for _attempt in range(SPEECH_RETRY):
         try:
             result = get_request(
                 '%s/Forums/Speeches/?SpeechType=%s&Page=%s' %
                 (DOMAIN, self.speech_type, self.page_num),
                 cookie=self.cookie)
             if result['status']:
                 soup = BeautifulSoup(tryutf8(result['content']))
                 # Loop-invariant: build the day-line matcher once per page
                 # rather than once per speech.  Kept inside the try so a
                 # decode failure is still reported like any other error.
                 day_sign_str = '本地演讲,第'.decode('utf-8')
                 day_pattern = re.compile(r'%s.+' % day_sign_str)
                 for speech_item in soup.find_all('div', class_='Speech'):
                     speech = {}
                     # Re-parse the single item so the lookups below cannot
                     # match elements belonging to other speeches.
                     b = BeautifulSoup(str(speech_item))
                     # day and time: the first three digit runs of the
                     # matching paragraph, in that order.
                     day_tags = b.find_all('p', text=day_pattern)
                     if day_tags:
                         numbers = regex_find(r'[0-9]+',
                                              day_tags[0].string)
                         speech['day'] = int(numbers[0])
                         speech['hour'] = int(numbers[1])
                         speech['minute'] = int(numbers[2])
                     # speech id
                     speechId = b.find_all('div',
                                           class_='Speech')[0]['speechid']
                     speech['id'] = speechId
                     # author: text of the second WithEntityCard anchor
                     speech['author'] = tryutf8(
                         b.find_all('a', class_='WithEntityCard',
                                    href=True)[1].string)
                     # content
                     speech['content'] = tryutf8(
                         b.find_all('p', class_='')[0].text)
                     # links
                     for href in b.find_all('a', class_='', href=True):
                         href_text = href.string
                         if not href_text:
                             # .string is None for anchors with nested
                             # markup and may be empty; indexing [0] on
                             # either used to abort the whole page.
                             continue
                         if href_text[0] == '#':
                             # tag: first and last characters are dropped
                             speech['tag'] = tryutf8(href_text[1:-1])
                         elif href_text[0] == '(':
                             # link: [label without first/last char, URL]
                             speech.setdefault('links', []).append([
                                 tryutf8(href_text[1:-1]), href['href']
                             ])
                     # Counters, looked up by the span's type attribute in
                     # the original order (like=1, watch=3, dislike=2).
                     # A missing span now simply leaves the key out, the
                     # same way a missing day paragraph does, instead of
                     # raising AttributeError on the empty result list.
                     # NOTE(review): confirm consumers tolerate the key
                     # being absent.
                     for key, type_code in (('like', '1'),
                                            ('watch', '3'),
                                            ('dislike', '2')):
                         number_tags = b.find_all('span',
                                                  class_='Number',
                                                  type=type_code)
                         if number_tags:
                             # Drop the first and last character around
                             # the number before converting.
                             speech[key] = int(
                                 number_tags[0].string[1:-1])
                     # The context manager releases the lock even if the
                     # store raises.
                     with threadLock:
                         SPEECHES[speechId] = speech
             break
         except Exception as e:
             # Best effort: report the failing page and try again.
             print('%s %s' % (self.page_num, e))
Beispiel #4
0
 def run(self):
     """Download one page of speeches and store each parsed speech.

     The page is ``DOMAIN/Forums/Speeches/`` for ``self.speech_type`` and
     ``self.page_num``, requested with ``self.cookie``.  Every
     ``div.Speech`` element becomes a dict that is written to the
     module-level ``SPEECHES`` mapping, keyed by the element's
     ``speechid`` attribute, while ``threadLock`` is held.

     Dict keys: ``id``, ``author`` and ``content`` are always set;
     ``day``/``hour``/``minute`` only when the day paragraph is found;
     ``tag``/``links`` only when matching anchors exist;
     ``like``/``watch``/``dislike`` only when the counter span exists.

     Up to ``SPEECH_RETRY`` attempts are made.  Only a raised exception
     leads to another attempt (it is printed with the page number); a
     response with a falsy ``status`` leaves the loop immediately.
     """
     # Not assigned in this method; the declarations are kept as they were.
     global FINISH
     global page_count
     for _attempt in range(SPEECH_RETRY):
         try:
             result = get_request(
                 '%s/Forums/Speeches/?SpeechType=%s&Page=%s' % (
                     DOMAIN, self.speech_type, self.page_num),
                 cookie=self.cookie)
             if result['status']:
                 soup = BeautifulSoup(tryutf8(result['content']))
                 # Built once per page instead of once per speech; still
                 # inside the try so failures are reported as before.
                 day_sign_str = '本地演讲,第'.decode('utf-8')
                 day_pattern = re.compile(r'%s.+' % day_sign_str)
                 for speech_item in soup.find_all('div', class_='Speech'):
                     speech = {}
                     # Parse the item on its own so searches stay scoped
                     # to this one speech.
                     b = BeautifulSoup(str(speech_item))
                     # day and time: first three digit runs, in order
                     day_tags = b.find_all('p', text=day_pattern)
                     if day_tags:
                         numbers = regex_find(
                             r'[0-9]+', day_tags[0].string)
                         speech['day'] = int(numbers[0])
                         speech['hour'] = int(numbers[1])
                         speech['minute'] = int(numbers[2])
                     # speech id
                     speechId = b.find_all(
                         'div', class_='Speech')[0]['speechid']
                     speech['id'] = speechId
                     # author: text of the second WithEntityCard anchor
                     speech['author'] = tryutf8(
                         b.find_all('a', class_='WithEntityCard', href=True)[1].string)
                     # content
                     speech['content'] = tryutf8(
                         b.find_all('p', class_='')[0].text)
                     # Links
                     for href in b.find_all('a', class_='', href=True):
                         href_text = href.string
                         if not href_text:
                             # None (nested markup) or empty text cannot
                             # be indexed; skip rather than fail the page.
                             continue
                         if href_text[0] == '#':
                             # tag, without its first and last character
                             speech['tag'] = tryutf8(href_text[1:-1])
                         elif href_text[0] == '(':
                             # link: [trimmed label, target URL]
                             speech.setdefault('links', []).append(
                                 [tryutf8(href_text[1:-1]), href['href']])
                     # like / watch / dislike, selected by the span's type
                     # attribute in the original order.  When the span is
                     # missing the key is omitted (as for the day fields)
                     # instead of raising AttributeError on an empty
                     # result list.
                     # NOTE(review): confirm consumers tolerate the key
                     # being absent.
                     for key, type_code in (
                             ('like', '1'), ('watch', '3'), ('dislike', '2')):
                         number_tags = b.find_all(
                             'span', class_='Number', type=type_code)
                         if number_tags:
                             # Strip the first and last character that
                             # surround the number.
                             speech[key] = int(number_tags[0].string[1:-1])
                     # Released automatically, even if the store raises.
                     with threadLock:
                         SPEECHES[speechId] = speech
             break
         except Exception as e:
             # Best effort: print the failing page number and retry.
             print('%s %s' % (self.page_num, e))
Beispiel #5
0
# Read the forum page to learn the city name, the current day number and
# the list of districts, then request the estate listing of each district.
# Nothing is fetched unless login() hands back a cookie.
cookie = login(EMAIL, PWD)
# Filled below with [district id, district name] pairs.
district = []
if cookie is None:
    print "Login fail"
else:
    square_result = get_request(
        DOMAIN + '/Forums/', cookie=cookie)
    # Only a response flagged with a truthy 'status' is parsed.
    if square_result['status']:
        # NOTE(review): `file` shadows the Python 2 builtin of that name.
        file = xlwt.Workbook(encoding='utf-8')
        square_content = square_result['content']
        soup = BeautifulSoup(square_content)
        # city
        # The part of the <title> text before the separator passed to
        # split() below.
        city_name = tryutf8(
            soup.find_all('title')[0].string.split('的广场'.decode('utf-8'))[0])
        # day
        # First run of digits in the second <p> of the first div.Clock.
        day_number = int(regex_find(
            r'[0-9]+', BeautifulSoup(str(soup.find_all('div', class_='Clock')[0])).find_all('p')[1].string)[0])
        # district
        # NOTE(review): href=True restricts the match to <div> elements that
        # themselves carry an href attribute -- confirm the page markup
        # really has those, otherwise no district is found.
        soup_districts = soup.find_all('div', class_='District', href=True)
        for district_item in soup_districts:
            # First anchor inside the district element that has an href and
            # no class attribute.
            soup_district_tag = BeautifulSoup(str(district_item)).find_all(
                'a', class_=False, href=True)[0]
            # ID = first digit run in the anchor's markup; name = its text.
            district.append([int(regex_find(
                r'[0-9]+', str(soup_district_tag))[0]),  tryutf8(soup_district_tag.string)])
        for district_item in district:
            district_id = district_item[0]
            district_name = district_item[1]
            # Starts empty for every district.
            estates = []
            district_result = get_request(
                DOMAIN + '/Districts/' + str(district_id) + '/Estates/', cookie=cookie)
            if district_result['status']:
                district_content = district_result['content']
Beispiel #6
0
district = []
if cookie is None:
    print "Login fail"
else:
    square_result = get_request(DOMAIN + '/Forums/', cookie=cookie)
    if square_result['status']:
        file = xlwt.Workbook(encoding='utf-8')
        square_content = square_result['content']
        soup = BeautifulSoup(square_content)
        # city
        city_name = tryutf8(
            soup.find_all('title')[0].string.split('的广场'.decode('utf-8'))[0])
        # day
        day_number = int(
            regex_find(
                r'[0-9]+',
                BeautifulSoup(str(soup.find_all(
                    'div', class_='Clock')[0])).find_all('p')[1].string)[0])
        # district
        soup_districts = soup.find_all('div', class_='District', href=True)
        for district_item in soup_districts:
            soup_district_tag = BeautifulSoup(str(district_item)).find_all(
                'a', class_=False, href=True)[0]
            district.append([
                int(regex_find(r'[0-9]+', str(soup_district_tag))[0]),
                tryutf8(soup_district_tag.string)
            ])
        for district_item in district:
            district_id = district_item[0]
            district_name = district_item[1]
            estates = []
            district_result = get_request(DOMAIN + '/Districts/' +