Example #1
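A worker thread's run method from the getWorks scraper: it fetches one page of /Forums/Statistics/ via get_request, parses the StatisticsRow divs with BeautifulSoup, and records the nine Number cells of each row in the shared WORKS dict under threadLock.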
def run(self):
    global FINISH
    global page_count
    # Retry the request up to RETRY times before giving up on this page.
    for i in range(RETRY):
        try:
            result = get_request(
                '%s/Forums/Statistics/?Page=%s' % (DOMAIN, self.page_num),
                cookie=self.cookie)
            if result['status']:
                content = tryutf8(result['content'])
                soup = BeautifulSoup(content)
                status_rows = soup.find_all('div', class_='StatisticsRow')
                for status in status_rows:
                    b = BeautifulSoup(str(status))
                    b_tag = b.find_all('p', class_='Number')
                    # A complete statistics row has exactly 9 Number cells.
                    if len(b_tag) == 9:
                        works = [tag.string for tag in b_tag]
                        # Drop the first and last characters (wrapping punctuation).
                        works[0] = works[0][1:-1]
                        threadLock.acquire()
                        WORKS[b_tag[0].string] = works
                        threadLock.release()
            break
        except Exception as e:
            print self.page_num, e
Example #2
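The tail of the getWorks worker plus the driver: after login it reads the total page count from the pagination bar, spawns one getWorks thread per page, and throttles creation so at most THREAD_SIZE threads run at once.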
            except Exception as e:
                print self.page_num, e
        threadLock.acquire()
        FINISH += 1
        print_processbar(page_count, FINISH)
        threadLock.release()


cookie = login(EMAIL, PWD)
if cookie is None:
    print "Login failed"
else:
    if not os.path.exists('works'):
        os.makedirs('works')
    # Get the page count from the pagination bar: the second-to-last
    # link holds the number of the last page.
    result = get_request(
        "%s/Forums/Statistics/" % DOMAIN, cookie=cookie)
    if result['status']:
        content = result['content']
        pagination = BeautifulSoup(content).find_all(
            'div', class_='Pagination')
        if pagination:
            page_count = int(
                BeautifulSoup(str(pagination[0])).find_all(
                    'a', href=True)[-2].string)
    threads = []
    for i in range(1, page_count + 1):
        threads.append(getWorks(i, cookie))
    for thread in threads:
        thread.start()
        # Busy-wait until the number of live threads drops below THREAD_SIZE.
        while True:
            if len(threading.enumerate()) < THREAD_SIZE:
                break
Example #3
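The run method of a getSpeech worker: it fetches one page of /Forums/Speeches/ and, for each Speech div, extracts day/time, id, author, content, tag, links, and the like/watch/dislike counters into the shared SPEECHES dict.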
def run(self):
    global FINISH
    global page_count
    # Retry the request up to SPEECH_RETRY times before giving up.
    for i in range(SPEECH_RETRY):
        try:
            result = get_request(
                '%s/Forums/Speeches/?SpeechType=%s&Page=%s' %
                (DOMAIN, self.speech_type, self.page_num),
                cookie=self.cookie)
            if result['status']:
                content = tryutf8(result['content'])
                soup = BeautifulSoup(content)
                speech_items = soup.find_all('div', class_='Speech')
                for speech_item in speech_items:
                    speech = {}
                    b = BeautifulSoup(str(speech_item))
                    # Day and time: the timestamp reads "本地演讲,第<N>天..."
                    # ("local speech, day N"); pull day, hour and minute.
                    day_sign_str = '本地演讲,第'.decode('utf-8')
                    day_tag = b.find_all(
                        'p', text=re.compile(r'%s.+' % day_sign_str))
                    if day_tag:
                        day_tag = day_tag[0]
                        numbers = regex_find(r'[0-9]+', day_tag.string)
                        speech['day'] = int(numbers[0])
                        speech['hour'] = int(numbers[1])
                        speech['minute'] = int(numbers[2])
                    # Speech id
                    speechId = b.find_all(
                        'div', class_='Speech')[0]['speechid']
                    speech['id'] = speechId
                    # Author: the second WithEntityCard link
                    speech['author'] = tryutf8(
                        b.find_all('a', class_='WithEntityCard',
                                   href=True)[1].string)
                    # Content
                    speech['content'] = tryutf8(
                        b.find_all('p', class_='')[0].text)
                    # Links: '#...' anchors are tags, '(...)' anchors are links
                    hrefs = b.find_all('a', class_='', href=True)
                    for href in hrefs:
                        href_text = href.string
                        if not href_text:
                            continue
                        if href_text[0] == '#':
                            speech['tag'] = tryutf8(href_text[1:-1])
                        elif href_text[0] == '(':
                            if 'links' not in speech:
                                speech['links'] = []
                            speech['links'].append(
                                [tryutf8(href_text[1:-1]), href['href']])
                    # Counters: type 1 = like, 3 = watch, 2 = dislike.
                    # Read a counter only when its tag is present.
                    like_tag = b.find_all('span', class_='Number', type='1')
                    if like_tag:
                        speech['like'] = int(like_tag[0].string[1:-1])
                    watch_tag = b.find_all('span', class_='Number', type='3')
                    if watch_tag:
                        speech['watch'] = int(watch_tag[0].string[1:-1])
                    dislike_tag = b.find_all('span', class_='Number', type='2')
                    if dislike_tag:
                        speech['dislike'] = int(dislike_tag[0].string[1:-1])
                    threadLock.acquire()
                    SPEECHES[speechId] = speech
                    threadLock.release()
            break
        except Exception as e:
            print self.page_num, e
Example #4
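A standalone estate scraper: it logs in, writes a CSV header, then walks the estate detail pages by id, checking that the returned page actually belongs to the requested id before parsing it.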
# -*- coding: utf-8 -*-
import urllib2
import re
from mccblackteck import get_request, login, regex_find, try_get_value, write_file
from account import EMAIL, PWD

cookie = login(EMAIL, PWD)
if cookie is None:
    print "Login failed"
else:
    # CSV header (Chinese): ID;City;District;Terrain;Block;Status;Type;Name;
    # Manager;Owner;Product;Product level;Construction;Planning;Land area;
    # Jobs;Residence;Capacity;Employed;Maintenance;Development
    write_file('estate.txt', 'w+',
               'ID;城市;城区;地形;街区;状态;类型;名称;管理;拥有;产品;产品等级;建设;规划;占地;工作;居住;容纳;已工作;维护;开发')
    # Walk every estate detail page by id.
    for i in range(1, 50000):
        result = get_request(
            'http://civitas.soobb.com/Estates/' + str(i) + '/Details/',
            cookie=cookie)
        if result['status']:
            content = result['content']
            estateName = estateType = estateStatus = estateLocation = \
                estatePosition = estateManage = estateOwner = estateDistricts = \
                estateProduct = estateProductLevel = estateCity = ''
            # Give each dict its own object (chained assignment would
            # alias all three names to a single dict).
            estateDevelop, estateArea, estatePeople = {}, {}, {}
            # Check that the page really belongs to estate id i.
            idList = regex_find(
                r'<div class=\"Avatar AvatarMedium\"><a href=\"/Estates/[0-9]+/Details/\" class=\"Normal\">',
                content)
            if len(idList) == 1:
                estate_id = re.compile(
                    r'(<div class=\"Avatar AvatarMedium\"><a href=\"/Estates/)([0-9]+)(/Details/\" class=\"Normal\">)'
                ).sub(r'\2', idList[0])
                if str(estate_id) != str(i):
                    continue
            # Estate name
            nameList = regex_find(
                r'<a href="/Estates/[0-9]+/Details/">.+</a>', content)
            if len(nameList) == 1:
Example #5
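The driver for the getSpeech workers, mirroring Example #2: read the page count from the pagination bar, spawn one thread per page, and throttle with THREAD_SIZE.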
            except Exception as e:
                print self.page_num, e
        threadLock.acquire()
        FINISH += 1
        print_processbar(page_count, FINISH)
        threadLock.release()


cookie = login(EMAIL, PWD)
if cookie is None:
    print "Login failed"
else:
    if not os.path.exists('speech'):
        os.makedirs('speech')
    # Get the page count from the pagination bar: the second-to-last
    # link holds the number of the last page.
    result = get_request(
        "%s/Forums/Speeches/?SpeechType=%s" % (DOMAIN, SPEECH_TYPE),
        cookie=cookie)
    if result['status']:
        content = result['content']
        pagination = BeautifulSoup(content).find_all(
            'div', class_='Pagination')
        if pagination:
            page_count = int(
                BeautifulSoup(str(pagination[0])).find_all(
                    'a', href=True)[-2].string)
    threads = []
    for i in range(1, page_count + 1):
        threads.append(getSpeech(i, cookie))
    for thread in threads:
        thread.start()
        # Busy-wait until the number of live threads drops below THREAD_SIZE.
        while True:
            if len(threading.enumerate()) < THREAD_SIZE:
                break
Example #6
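A forum ("square") scraper: it reads the city name from the page title and the in-game day from the Clock widget, then iterates over the District divs, collecting output in an xlwt workbook.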
# -*- coding: utf-8 -*-
import re
from mccblackteck import tryutf8, get_request, login, regex_find, DOMAIN
from account import EMAIL, PWD
import datetime
import os
import xlwt
from bs4 import BeautifulSoup

cookie = login(EMAIL, PWD)
district = []
if cookie is None:
    print "Login failed"
else:
    square_result = get_request(DOMAIN + '/Forums/', cookie=cookie)
    if square_result['status']:
        file = xlwt.Workbook(encoding='utf-8')
        square_content = square_result['content']
        soup = BeautifulSoup(square_content)
        # City: the page title is "<city>的广场" ("<city>'s square");
        # split on the suffix to recover the city name.
        city_name = tryutf8(
            soup.find_all('title')[0].string.split('的广场'.decode('utf-8'))[0])
        # Day: the second <p> inside the Clock widget carries the day number.
        day_number = int(regex_find(
            r'[0-9]+',
            BeautifulSoup(str(soup.find_all('div', class_='Clock')[0]))
            .find_all('p')[1].string)[0])
        # Districts
        soup_districts = soup.find_all('div', class_='District', href=True)
        for district_item in soup_districts:
            soup_district_tag = BeautifulSoup(str(district_item)).find_all(
                'a', class_=False, href=True)[0]