Example #1
import datetime
import re

def get_cst():
    """ Return China Standard Time scraped from beijing-time.org, falling back to the local clock. """
    from opener import Opener
    reg = re.compile(r'nyear=(?P<year>\d+).*?nmonth=(?P<month>\d+).*?nday=(?P<day>\d+).*?nwday=(\d+).*?nhrs=(?P<hour>\d+).*?nmin=(?P<minute>\d+).*?nsec=(?P<second>\d+)', re.S)

    opener = Opener(encoding='utf8')
    content = opener.urlopen('http://www.beijing-time.org/time.asp', times=0)
    search_obj = reg.search(content)
    if search_obj:
        # groupdict() only contains the named groups, so the unnamed nwday group is ignored.
        return datetime.datetime(**{key: int(value) for key, value in search_obj.groupdict().items()})
    return datetime.datetime.now()
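For context, a minimal sketch of how get_cst() is exercised on its own (assuming the project-local opener module is importable):

if __name__ == '__main__':
    # Prints China Standard Time as reported by beijing-time.org, or the
    # local clock if the request or the parse fails.
    print(get_cst())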
Example #3
def get_reader(username=None, password=None, use_cookie_file=False):
    # `session` and `tmp_dir` are module-level globals in the original project.
    reader = Reader(Opener())
    if 'cookies' in session:
        print "Loading cookies"
        reader.opener.load_cookies(session['cookies'])
    elif use_cookie_file:
        print "Loading cookies from file"
        with open(tmp_dir + "cookies.txt", "r") as text_file:
            cookies = text_file.read()
            print cookies
            reader.opener.load_cookies(cookies)
    elif username is None:
        print "Cannot login, no username provided"
        return (None, "No username provided")
    else:
        print "Logging in as ", username
        reader.init()

        if not os.path.exists(tmp_dir):
            os.mkdir(tmp_dir)

        with open(tmp_dir + "cookies.txt", "wb") as text_file:
            text_file.write(reader.opener.get_cookies())

        result = reader.login(username, password)

        if "The userID or password could not be validated" in result:
            print "Bad User ID or password"
            return (None, "Bad User ID or password")

        if "Concurrent Login Error" in result:
            print "User already logged in"
            return (None, "User already logged in")

        print "Logged in"
    return (reader, "")
Example #6
from bs4 import BeautifulSoup
from opener import Opener
from reader import Reader
import os
import parser

# TODO - Detect "Your query returned more than 200 records"

first = "GEORGE"
last = "BROWN"

print os.environ['username'], os.environ['password']

reader = Reader(Opener())
reader.init()
result = reader.login(os.environ['username'], os.environ['password'])
if "The userID or password could not be validated" in result:
    print "Login failed"
    exit()

try:
    print "Searching"
    result = reader.search(first, last)
    cases = parser.parse_search(result)

    for case in cases:
        print "Collecting " + case['id']
        result = reader.case_summary(case['id'])
        parser.parse_case_summary(result, case)

        result = reader.case_charges()
        # ... the original snippet is truncated here ...
except Exception as e:
    # Minimal handler so the truncated try block above is valid Python.
    print "Error:", e

# Assumed imports for the court-scraper classes below (Python 2, matching the
# urllib.urlencode and print-statement usage); `captcha` is the project-local
# helper used by the second DistrictCourtOpener.
import time
import urllib
from bs4 import BeautifulSoup
from selenium import webdriver
import captcha
from opener import Opener
class DistrictCourtOpener:
    url_root = 'https://eapps.courts.state.va.us/gdcourts/'

    def __init__(self):
        self.opener = Opener('district')
        self.use_driver = True

    def url(self, url):
        return DistrictCourtOpener.url_root + url

    def log_off(self):
        return None

    def open_driver(self):
        self.driver = webdriver.Chrome('./chromedriver')
        self.driver.implicitly_wait(3)
        self.driver_open = True

    def open_welcome_page(self):
        url = self.url('caseSearch.do?welcomePage=welcomePage')
        # Warm-up request: the scraper fetches an unrelated page before the court site.
        page = self.opener.open('https://google.com')
        page_content = page.read()
        page = self.opener.open(url)
        page_content = page.read()
        # See if we need to solve a captcha
        if 'By clicking Accept' in page_content:
            self.solve_captcha(url)
            page = self.opener.open(url)
            page_content = page.read()
        if 'By clicking Accept' in page_content:
            raise RuntimeError('CAPTCHA failed')
        return BeautifulSoup(page_content, 'html.parser')

    def solve_captcha(self, url):
        # Opens a real browser for a human to solve the CAPTCHA; the loop below
        # exits once solving it navigates the browser away from `url`.
        self.open_driver()
        self.driver.get(url)
        time.sleep(1)
        current_url = self.driver.current_url
        print current_url

        while current_url == self.driver.current_url:
            time.sleep(1)
        '''
        log.info('Solving CAPTCHA')
        captcha_solver = deathbycaptcha.SocketClient(os.environ['DBC_USER'], \
                                                     os.environ['DBC_PASSWORD'])
        self.driver.get(url)
        captcha = self.driver.find_element_by_id('recaptcha_challenge_image')
        image_src = captcha.get_attribute('src')
        image_filename = str(os.getpid()) + '_captcha.png'
        urllib.urlretrieve(image_src, image_filename)
        try:
            captcha_solution = captcha_solver.decode(image_filename, 60)
            #captcha_solution = {'captcha': 'manual', 'text': raw_input('Enter CAPTCHA:')}
            if captcha_solution:
                log.info('CAPTCHA SOLVED')
                print "CAPTCHA %s solved: %s" % (captcha_solution["captcha"],
                                                 captcha_solution["text"])
                self.driver.find_element_by_name('recaptcha_response_field') \
                      .send_keys(captcha_solution["text"])
                os.remove(image_filename)
        except deathbycaptcha.AccessDeniedException:
            log.error('deathbycaptcha access denied')
            print 'deathbycaptcha access denied'
        time.sleep(1)
        self.driver.find_element_by_name('captchaVerificationForm') \
              .submit()
        '''
        cookie = self.driver.get_cookie('JSESSIONID')['value']
        self.opener.set_cookie('JSESSIONID', cookie)
        self.opener.save_cookies()
        self.driver.quit()

    def change_court(self, name, code):
        data = urllib.urlencode({
            'selectedCourtsName': name,
            'selectedCourtsFipCode': code,
            'sessionCourtsFipCode': ''
        })
        url = self.url('changeCourt.do')
        self.opener.open(url, data)

    def open_hearing_date_search(self, code, search_division):
        url = self.url('caseSearch.do')
        url += '?fromSidebar=true&searchLanding=searchLanding'
        url += '&searchType=hearingDate&searchDivision=' + search_division
        url += '&searchFipsCode=' + code
        url += '&curentFipsCode=' + code
        self.opener.open(url)

    def do_hearing_date_search(self, code, date, first_page):
        data = {
            'formAction': '',
            'curentFipsCode': code,
            'searchTerm': date,
            'searchHearingTime': '',
            'searchCourtroom': '',
            'lastName': '',
            'firstName': '',
            'middleName': '',
            'suffix': '',
            'searchHearingType': '',
            'searchUnitNumber': '',
            'searchFipsCode': code
        }
        if first_page:
            data['caseSearch'] = 'Search'
        else:
            data['caseInfoScrollForward'] = 'Next'
            data['unCheckedCases'] = ''
        data = urllib.urlencode(data)
        url = self.url('caseSearch.do')
        page = self.opener.open(url, data)
        content = ''
        # The results page emits self-closing <a .../> tags, which would drop the
        # case-details links during parsing; normalize them first.
        for line in page:
            if '<a href="caseSearch.do?formAction=caseDetails' in line:
                line = line.replace('/>', '>')
            content += line
        soup = BeautifulSoup(content, 'html.parser')
        return soup

    def open_case_number_search(self, code, search_division):
        url = self.url('criminalCivilCaseSearch.do')
        url += '?fromSidebar=true&formAction=searchLanding&searchDivision=' + search_division
        url += '&searchFipsCode=' + code
        url += '&curentFipsCode=' + code
        self.opener.open(url)

    def do_case_number_search(self, code, case_number, search_division):
        data = {
            'formAction': 'submitCase',
            'searchFipsCode': code,
            'searchDivision': search_division,
            'searchType': 'caseNumber',
            'displayCaseNumber': case_number,
            'localFipsCode': code,
            'clientSearchCounter': 0
        }
        data = urllib.urlencode(data)
        url = self.url('criminalCivilCaseSearch.do')
        self.opener.open(url, data)
        # the post returns 302, then we have to do a GET... strange

        url = self.url('criminalDetail.do')
        content = self.opener.open(url)
        soup = BeautifulSoup(content, 'html.parser')
        return soup

    def open_case_details(self, details_url):
        url = self.url(details_url)
        page = self.opener.open(url)
        return BeautifulSoup(page.read(), 'html.parser')

    def open_name_search(self, code, search_division):
        url = self.url('nameSearch.do')
        url += '?fromSidebar=true&formAction=searchLanding&searchDivision=' + search_division
        url += '&searchFipsCode=' + code
        url += '&curentFipsCode=' + code
        if self.use_driver:
            self.driver.get(url)
        else:
            self.opener.open(url)

    def do_name_search_with_driver(self, code, name, count, prev_cases):
        if prev_cases:
            xpath = "//input[@value='Next'][@type='submit']"
            self.driver.find_element_by_xpath(xpath).click()
        else:
            self.driver.find_element_by_name('localnamesearchlastName') \
                .send_keys(name)
            xpath = "//input[@value='Search'][@type='submit']"
            self.driver.find_element_by_xpath(xpath).click()
        time.sleep(1)
        source = self.driver.page_source
        soup = BeautifulSoup(source, 'html.parser')
        return soup

    def do_name_search(self,
                       code,
                       search_division,
                       name,
                       count,
                       prev_cases=None):
        if self.use_driver:
            return self.do_name_search_with_driver(code, name, count,
                                                   prev_cases)
        data = {
            'formAction': 'newSearch',
            'displayCaseNumber': '',
            'formBean': '',
            'localFipsCode': code,
            'caseActive': '',
            'localLastName': '',
            'forward': '',
            'back': '',
            'localnamesearchlastName': name,
            'lastName': name,
            'localnamesearchfirstName': '',
            'firstName': '',
            'localnamesearchmiddleName': '',
            'middleName': '',
            'localnamesearchsuffix': '',
            'suffix': '',
            'localnamesearchsearchCategory': 'A',
            'searchCategory': 'A',
            'searchFipsCode': code,
            'searchDivision': search_division,
            'searchType': 'name',
            'firstRowName': '',
            'firstRowCaseNumber': '',
            'lastRowName': '',
            'lastRowCaseNumber': '',
            'clientSearchCounter': count
        }
        if prev_cases:
            data['formAction'] = 'next'
            data['unCheckedCases'] = ''
            data['firstRowName'] = prev_cases[0]['defendant']
            data['firstRowCaseNumber'] = prev_cases[0]['case_number']
            data['lastRowName'] = prev_cases[-1]['defendant']
            data['lastRowCaseNumber'] = prev_cases[-1]['case_number']
        data = urllib.urlencode(data)
        url = self.url('nameSearch.do')
        content = self.opener.open(url, data)
        soup = BeautifulSoup(content, 'html.parser')
        return soup
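A hypothetical driver for the class above; the court name, FIPS code, division code, and hearing date are all placeholders:

opener = DistrictCourtOpener()
opener.open_welcome_page()
opener.change_court('Fairfax General District Court', '059')   # placeholder court
opener.open_hearing_date_search('059', 'V')                    # placeholder division code
soup = opener.do_hearing_date_search('059', '01/15/2016', True)
print soup.title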
class CircuitCourtOpener:
    url_root = 'http://ewsocis1.courts.state.va.us/CJISWeb/'

    def __init__(self):
        self.opener = Opener('circuit')

    def url(self, url):
        return CircuitCourtOpener.url_root + url

    def open_welcome_page(self):
        url = self.url('circuit.jsp')
        page = self.opener.open(url)
        return BeautifulSoup(page.read(), 'html.parser')

    def log_off(self):
        data = urllib.urlencode({'searchType': ''})
        url = self.url('Logoff.do')
        self.opener.open(url, data)

    def change_court(self, code, court):
        data = urllib.urlencode({
            'courtId': code,
            'courtType': 'C',
            'caseType': 'ALL',
            'testdos': False,
            'sessionCreate': 'NEW',
            'whichsystem': court
        })
        url = self.url('MainMenu.do')
        self.opener.open(url, data)

    def do_case_number_search(self, code, case_number, category):
        data = {
            'courtId':code,
            'caseNo':case_number,
            'categorySelected':category
        }
        data = urllib.urlencode(data)
        url = self.url('CaseDetail.do')
        page = self.opener.open(url, data)
        return BeautifulSoup(page.read(), 'html.parser')

    def do_name_search(self, code, name, category):
        data = {
            'category': category,
            'lastName': name,
            'courtId': code,
            'submitValue': 'N'
        }
        data = urllib.urlencode(data)
        url = self.url('Search.do')
        page = self.opener.open(url, data)
        return BeautifulSoup(page.read(), 'html.parser')

    def continue_name_search(self, code, category):
        data = {
            'courtId': code,
            'pagelink': 'Next',
            'lastCaseProcessed': '',
            'firstCaseProcessed': '',
            'lastNameProcessed': '',
            'firstNameProcessed': '',
            'category': category,
            'firstCaseSerialNumber': 0,
            'lastCaseSerialNumber': 0,
            'searchType': '',
            'emptyList': ''
        }
        data = urllib.urlencode(data)
        url = self.url('Search.do')
        page = self.opener.open(url, data)
        return BeautifulSoup(page.read(), 'html.parser')

    def do_date_search(self, code, date, category):
        data = {
            'hearSelect':'',
            'selectDate':date,
            'categorySelected':category,
            'hearDateSelected':date,
            'submitValue':'',
            'courtId':code
        }
        data = urllib.urlencode(data)
        url = self.url('hearSearch.do')
        page = self.opener.open(url, data)
        return BeautifulSoup(page.read(), 'html.parser')

    def continue_date_search(self, code, category):
        data = {
            'courtId': code,
            'pagelink': 'Next',
            'lastCaseProcessed': '',
            'firstCaseProcessed': '',
            'category': category,
            'firstCaseSerialNumber': 0,
            'lastCaseSerialNumber': 0,
            'searchType': '',
            'emptyList': ''
        }
        data = urllib.urlencode(data)
        url = self.url('hearSearch.do')
        page = self.opener.open(url, data)
        return BeautifulSoup(page.read(), 'html.parser')
class DistrictCourtOpener:
    url_root = 'https://eapps.courts.state.va.us/gdcourts/'
    
    def __init__(self):
        self.opener = Opener()
    
    def url(self, url):
        return DistrictCourtOpener.url_root + url
    
    def open_welcome_page(self):
        url = self.url('caseSearch.do?welcomePage=welcomePage')
        page = self.opener.open(url)
        page_content = page.read()
        soup = BeautifulSoup(page_content, 'html.parser')
        
        # See if we need to solve a captcha
        if 'By clicking Accept' in page_content:
            captcha_url = self.url('captchaVerification.do')
            captcha.solve(self.opener, captcha_url)
            
            page = self.opener.open(url)
            soup = BeautifulSoup(page.read(), 'html.parser')
        return soup
    
    def change_court(self, name, code):
        data = urllib.urlencode({
            'selectedCourtsName': name,
            'selectedCourtsFipCode': code,
            'sessionCourtsFipCode': ''
        })
        url = self.url('changeCourt.do')
        self.opener.open(url, data)
    
    def open_hearing_date_search(self, code):
        url = self.url('caseSearch.do')
        url += '?fromSidebar=true&searchLanding=searchLanding'
        url += '&searchType=hearingDate&searchDivision=T'
        url += '&searchFipsCode=' + code
        url += '&curentFipsCode=' + code
        self.opener.open(url)
    
    def do_hearing_date_search(self, code, date, first_page):
        data = {
            'formAction':'',
            'curentFipsCode':code,
            'searchTerm':date,
            'searchHearingTime':'',
            'searchCourtroom':'',
            'lastName':'',
            'firstName':'',
            'middleName':'',
            'suffix':'',
            'searchHearingType':'',
            'searchUnitNumber':'',
            'searchFipsCode':code
        }
        if first_page:
            data['caseSearch'] = 'Search'
        else:
            data['caseInfoScrollForward'] = 'Next'
            data['unCheckedCases'] = ''
        
        data = urllib.urlencode(data)
        url = self.url('caseSearch.do')
        page = self.opener.open(url, data)
        content = ''
        for line in page:
            if '<a href="caseSearch.do?formAction=caseDetails' in line:
                line = line.replace('/>', '>')
            content += line
        soup = BeautifulSoup(content, 'html.parser')
        return soup
    
    def open_case_details(self, case):
        url = self.url(case['details_url'])
        page = self.opener.open(url)
        return BeautifulSoup(page.read(), 'html.parser')
class CircuitCourtOpener:
    url_root = 'http://ewsocis1.courts.state.va.us/CJISWeb/'

    def __init__(self):
        self.opener = Opener('circuit')

    def url(self, url):
        return CircuitCourtOpener.url_root + url

    def open_welcome_page(self):
        url = self.url('circuit.jsp')
        page = self.opener.open(url)
        return BeautifulSoup(page.read(), 'html.parser')

    def log_off(self):
        data = urllib.urlencode({'searchType': ''})
        url = self.url('Logoff.do')
        self.opener.open(url, data)

    def change_court(self, code, court):
        data = urllib.urlencode({
            'courtId': code,
            'courtType': 'C',
            'caseType': 'ALL',
            'testdos': False,
            'sessionCreate': 'NEW',
            'whichsystem': court
        })
        url = self.url('MainMenu.do')
        self.opener.open(url, data)

    def do_case_number_search(self, code, case_number, category):
        data = {
            'submitValue': '',
            'courtId': code,
            'caseNo': case_number,
            'categorySelected': category
        }
        data = urllib.urlencode(data)
        url = self.url('CaseDetail.do')
        page = self.opener.open(url, data)
        return BeautifulSoup(page.read(), 'html.parser')

    def do_case_number_pleadings_search(self, code, case_number, category):
        data = {
            'submitValue': 'P',
            'courtId': code,
            'categorySelected': category,
            'caseStatus': 'A',
            'caseNo': case_number
        }
        data = urllib.urlencode(data)
        url = self.url('CaseDetail.do')
        page = self.opener.open(url, data)
        return BeautifulSoup(page.read(), 'html.parser')

    def do_case_number_services_search(self, code, case_number, category):
        data = {
            'submitValue': 'S',
            'courtId': code,
            'categorySelected': category,
            'caseStatus': 'A',
            'caseNo': case_number
        }
        data = urllib.urlencode(data)
        url = self.url('CaseDetail.do')
        page = self.opener.open(url, data)
        return BeautifulSoup(page.read(), 'html.parser')

    def return_to_main_menu(self, code):
        data = {'courtId': code}
        data = urllib.urlencode(data)
        url = self.url('MainMenu.do')
        self.opener.open(url, data)
        return

    def do_name_search(self, code, name, category):
        data = {
            'category': category,
            'lastName': name,
            'courtId': code,
            'submitValue': 'N'
        }
        data = urllib.urlencode(data)
        url = self.url('Search.do')
        page = self.opener.open(url, data)
        return BeautifulSoup(page.read(), 'html.parser')

    def continue_name_search(self, code, category):
        data = {
            'courtId': code,
            'pagelink': 'Next',
            'lastCaseProcessed': '',
            'firstCaseProcessed': '',
            'lastNameProcessed': '',
            'firstNameProcessed': '',
            'category': category,
            'firstCaseSerialNumber': 0,
            'lastCaseSerialNumber': 0,
            'searchType': '',
            'emptyList': ''
        }
        data = urllib.urlencode(data)
        url = self.url('Search.do')
        page = self.opener.open(url, data)
        return BeautifulSoup(page.read(), 'html.parser')

    def do_date_search(self, code, date, category):
        data = {
            'hearSelect': '',
            'selectDate': date,
            'categorySelected': category,
            'hearDateSelected': date,
            'submitValue': '',
            'courtId': code
        }
        data = urllib.urlencode(data)
        url = self.url('hearSearch.do')
        page = self.opener.open(url, data)
        return BeautifulSoup(page.read(), 'html.parser')

    def continue_date_search(self, code, category):
        data = {
            'courtId': code,
            'pagelink': 'Next',
            'lastCaseProcessed': '',
            'firstCaseProcessed': '',
            'category': category,
            'firstCaseSerialNumber': 0,
            'lastCaseSerialNumber': 0,
            'searchType': '',
            'emptyList': ''
        }
        data = urllib.urlencode(data)
        url = self.url('hearSearch.do')
        page = self.opener.open(url, data)
        return BeautifulSoup(page.read(), 'html.parser')
# Assumed imports for this example (Python 3; note reduce lives in functools);
# the project-local module paths are inferred from the other examples, and
# get_cst() is defined in Example #1 above.
import datetime
import logging
import re
import time
from functools import reduce

from opener import Opener
from lib.models import CorpModel, InvalidCodeModel, MaxCodeModel   # assumed path

class Register_corp:
    def __init__(self):
        logging.basicConfig(filename='log.txt', level=logging.INFO, format='%(asctime)s %(message)s', datefmt='%Y-%m-%d %I:%M:%S %p')
        self.model = CorpModel
        self.invalid_code_model = InvalidCodeModel
        self.max_code_model = MaxCodeModel
        self.info_from = '广东红盾网'
        self.charset = 'gbk'
        self.opener = Opener(encoding=self.charset)
        # registration-authority code; 6 digits
        self.org_code = '441900'
        # enterprise-nature number; 2 digits: 00-30 domestic-funded, 40-50 foreign-funded.
        self.nature_num = 0
        # serial number; 4 digits
        self.ord_num = 1
        self.corp_url = 'http://wsnj.gdgs.gov.cn/aiccps/SearchServlet?service=getEntityInfoByPage&registerNo=%s'
        # Matches the "no matching data" (找不到相关的数据) result page.
        self.search_text_reg = re.compile(r'^<table width="100%" border="0"><tr><td align=center height=200 >找不到相关的数据\.\.</td></tr></table>$')
        self.regs = [
            re.compile(r'<td align=left width=100% colspan=6 height=25>(?P<name>[^<]+)', re.S),
            re.compile(r'址:</td><td align=left valign=top colspan=5>(?P<addr>[^<]*)', re.S),
            re.compile(r'号:</td><td align=left valign=top><font color="red">(?P<register_code>[^<]*)', re.S),
            re.compile(r'[名人]:</td><td align=left valign=top colspan=3>(?P<representative>[^<]*)', re.S),
            re.compile(r'型:</td><td align=left valign=top>(?P<nature>[^<]*)', re.S),
            re.compile(r'限:</td><td align=left valign=top colspan=3>(?P<period>[^<]*)', re.S),
            re.compile(r'本:</td><td align=left valign=top>(?P<capital>[^<]*)', re.S),
            re.compile(r'关:</td><td align=left valign=top colspan=3>(?P<register_department>[^<]*)', re.S),
            re.compile(r'态:</td><td align=left valign=top>(?P<status>[^<]*)', re.S),
            re.compile(r'期:</td><td align=left valign=top colspan=3>(?P<establishment_data>[^<]+)', re.S),
            re.compile(r'围:</td><td align=left valign=top colspan=5>(?P<scope>[^<]*)', re.S),
        ]
        self._save_times = 0
        self._today = get_cst()

    def _msg(self, text):
        return '%s %s' % (time.asctime(), text)

    def _process(self, corp_dict):
        corp_dict['insert_date'] = self._today
        corp_dict['info_from'] = self.info_from
        if 'establishment_data' in corp_dict and corp_dict['establishment_data']:
            corp_dict['establishment_data'] = self.model._str2date(corp_dict['establishment_data'])
        return corp_dict

    def calc_code15(self, org_code, nature_num, ord_num):
        """ Compute the check digit for the 15-digit registration number and return the full number. """
        code14 = '%s%02d%06d' % (org_code, nature_num, ord_num)
        # ISO 7064 MOD 11,10-style iteration over the 14 leading digits.
        temp = reduce(lambda x, y: ((x + int(y)) % 10 or 10) * 2 % 11, code14, 10)
        return '%s%s' % (code14, (11 - temp) % 10)

    def calc_code13(self, org_code, nature_num, ord_num):
        """ Return the 13-digit registration number. """
        return '%s%s%04d' % (org_code, nature_num, ord_num)

    def init_invalid_codes(self):
        i = 0
        for query_obj in self.max_code_model.get_all():
            org_code, nature_num, max_ord_num = query_obj.org_code, query_obj.nature_num, query_obj.ord_num
            for ord_num in range(0, max_ord_num):
                register_code = self.calc_code15(org_code, nature_num, ord_num)
                if self.model.exists_by(register_code=register_code):
                    continue
                self.invalid_code_model.add({'register_code': register_code}, is_commit=False)
                i += 1
                if not i%200:
                    self.model.commit()
                    print('Save 200 invalid codes!')
        self.model.commit()

    def fetch(self, code):
        print('###############################################################')
        print(self._msg('注册码: %s' % code))
        url = self.corp_url % code
        content = self.opener.urlopen(url, timeout=10, times=0)
        #if content.find(self.search_text) < 0:
        if self.search_text_reg.match(content):
            print('没找到相关信息.')
            return []
        result = []
        temp = {}
        for search_obj in (reg.search(content) for reg in self.regs):
            if search_obj:
                temp.update(search_obj.groupdict())
        print(temp)
        if temp:
            # Only count the fetch as successful when at least one field matched;
            # otherwise the `if not result` branch below could never trigger.
            result.append(temp)
        if not result:
            logging.info('register code: %s' % code)
        return result

    def save(self, register_code):
        self._save_times = (self._save_times + 1) % 101
        self._save_times or self.model.commit()
        corps = self.fetch(register_code)
        if not corps:
            return False
        for corp in corps:
            self.model.add(self._process(corp), is_commit=False)
            print('添加成功!')
        print('###############################################################')
        return True

    def action(self):
        invalid_times = 0
        while self.nature_num<=99:
            if self.save(self.calc_code15(self.org_code, self.nature_num, self.ord_num)):
                invalid_times = 0
            else:
                invalid_times += 1
            self.ord_num += 1
            if invalid_times >= 500:
                invalid_times = 0
                self.ord_num = 0
                if self.nature_num >=40:
                    self.nature_num = 0
                    self.org_code = str(int(self.org_code)+1)
                    if int(self.org_code)>=440200:
                        break
                else:
                    self.nature_num = 40
                logging.info('org_code: %s nature code: %s' % (self.org_code, self.nature_num))
        self.model.commit()

    def action_test(self):
        pass

    def action_new(self, invalid_times=20):
        for query_obj in self.max_code_model.get_all():
            org_code, nature_num, ord_num = query_obj.org_code, query_obj.nature_num, query_obj.ord_num+1
            times = 0
            while times<invalid_times:
                if self.save(self.calc_code15(org_code, nature_num, ord_num)):
                    times = 0
                    query_obj.ord_num = ord_num
                    # Guard against the query_obj.ord_num assignment above being left uncommitted.
                    self._save_times or self.model.commit()
                else:
                    times += 1
                ord_num += 1
            self.model.commit()

    def action_from_invalid_codes(self):
        for query_obj in self.invalid_code_model.get_all():
            if self.save(query_obj.register_code):
                query_obj.delete()
        self.model.commit()

    def action_from_file(self):
        f = open('others.txt')
        for line in f:
            code = line[:-1]
            self.save(code)
        f.close()
        self.model.commit()

    def report(self):
        corps = self.model.filter_by(status='登记成立', insert_date=datetime.date.today())
        #corps = self.model.filter_by(status='登记成立', insert_date=datetime.date(2011,11,3))
        rows = []
        fields = (
            ('名称', 'name'),
            ('注册码', 'register_code'),
            ('地址', 'addr'),
            ('经营范围', 'scope'),
            ('注册资金', 'capital'),
            ('成立日期', 'establishment_data'),
            ('企业性质', 'nature'),
            ('法人', 'representative'),
            ('企业状态', 'status'),
            ('期限', 'period'),
            ('登记单位', 'register_department'),
            ('信息来源', 'info_from'),
            ('更新日期', 'insert_date'),
        )
        self.model.report('东莞红盾网最新注册公司信息_%s.csv' % time.strftime('%Y-%m-%d'), fields=fields, rows=corps, encoder='gbk')
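The check-digit step in calc_code15 is easy to sanity-check in isolation; the expected value below is hand-computed from the same recurrence, so treat it as an assumption worth re-verifying:

from functools import reduce

def check_digit(code14):
    # Same recurrence as calc_code15, isolated for testing.
    temp = reduce(lambda x, y: ((x + int(y)) % 10 or 10) * 2 % 11, code14, 10)
    return (11 - temp) % 10

# org_code='441900', nature_num=0, ord_num=1  ->  code14 = '44190000000001'
assert check_digit('44190000000001') == 2   # full number: '441900000000012'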
Example #15
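This example depends on a few imports and a `coroutine` helper that the snippet does not show. A minimal sketch of what they could look like, assuming the classic generator-priming decorator and the module paths used elsewhere in these examples:

import datetime
import itertools
import logging
import multiprocessing
import re
import time
from functools import reduce   # Python 3

from opener import Opener
from lib.models import CorpModel, InvalidCodeModel, MaxCodeModel   # assumed path
# get_cst() is assumed importable; it is defined in Example #1 above.

def coroutine(func):
    """Assumed helper: primes a generator so .send() can be called immediately."""
    def wrapper(*args, **kwargs):
        gen = func(*args, **kwargs)
        next(gen)   # advance to the first yield
        return gen
    return wrapper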
class Corp(multiprocessing.Process):
    def __init__(self, corplist_url, corp_url, info_from, corplist_post_data=None, corp_post_data=None, corplist_reg=None, corp_regs=[], timeout=5, commit_each_times=30, has_cookie=True, charset='utf8', model=None):
        """ corplist_url and corp_url are str.format-style templates using {0}, {1} placeholders. """
        super().__init__()
        self.charset = charset
        self.info_from = info_from
        self.corplist_url = corplist_url
        self.corp_url = corp_url
        self.opener = Opener(has_cookie=has_cookie, encoding=self.charset)

        self.corplist_post_data = corplist_post_data
        self.corp_post_data = corp_post_data
        self.corplist_reg = corplist_reg
        self.corp_regs = corp_regs
        self.commit_each_times = commit_each_times
        self.timeout = timeout

        if model:
            self.model = model
        else:
            from lib.models import CorpModel
            self.model = CorpModel

        #self._today = get_cst()
        self._today = datetime.date.today()

    def _msg(self, msg=''):
        #print('%s %s' % (time.strftime('%Y-%m-%d %H:%M:%S'), msg))
        logging.info(msg)

    def set_queue(self, queue):
        self.queue = queue


    def process_corp_info(self, corp_info, date_reg=r'(?P<year>\d+)-(?P<month>\d+)-(?P<day>\d+)'):
        for key, values in corp_info.items():
            corp_info[key] = values.strip()
        if 'insert_date' in corp_info:
            corp_info['insert_date'] = self.model._str2date(corp_info['insert_date'], date_reg=date_reg)
        else:
            corp_info['insert_date'] = self._today
        return corp_info

    def get_next_page_url(self):
        """ Must be a plain (non-coroutine) generator, or return an iterable. """
        # Must be a tuple: a bare string would be iterated character by character.
        return (self.corplist_url,)

    def get_corp_url(self, corp_info={}):
        return self.corp_url.format(**corp_info)

    def prepare(self):
        pass

    def fetch_corplist(self, page_url):
        """ 如果成功抓取, 返回一个包含 Corp Info dict 的列表或者iterable; 否则返回 {}. """
        content = self.opener.urlopen(page_url, data=self.corplist_post_data, timeout=self.timeout, times=0)
        return ({} if not search_obj else search_obj.groupdict() for search_obj in self.corplist_reg.finditer(content))

    def fetch_corp(self, corp_info=None):
        """ 如果成功抓取, 返回一个Corp Info 的 dict. """
        corp_url = self.get_corp_url(corp_info)
        content = self.opener.urlopen(corp_url, data=self.corp_post_data, timeout=self.timeout, times=0)
        for reg in self.corp_regs:
            search_obj = reg.search(content)
            search_obj and corp_info.update(search_obj.groupdict())
        return corp_info

    def before_save(self, corp_info):
        corp_info = self.process_corp_info(corp_info)
        corp_info['info_from'] = self.info_from
        return corp_info

    def commit(self):
        self.model.commit()

    @coroutine
    def check_exists(self):
        """ Generator, 存在的话返回其 info_from, 否则返回 None. """
        corp_names_cache = {}
        corp_names_cache_list = []
        cache_length = 0
        result = None
        while 1:
            corp_info = (yield result)
            result = None
            corp_name = corp_info['name'].strip()
            if corp_name not in corp_names_cache:
                corp_names_cache[corp_name] = self.info_from
                corp_names_cache_list.insert(0,corp_name)
                cache_length += 1
                if cache_length > self.commit_each_times:
                    del corp_names_cache[corp_names_cache_list.pop()]
                    cache_length -= 1
                exists_corp = self.model.filter_by(name=corp_name).first()
                if exists_corp:
                    result = exists_corp.info_from
                    corp_names_cache[corp_name] = result
            else:
                result = corp_names_cache[corp_name]

    def run(self):
        self.prepare()
        check_exists = self.check_exists()
        cur_page = itertools.count()
        for page_url in self.get_next_page_url():
            print('\n%s 第%s页' % (self.info_from, next(cur_page)+1))
            for corp_info in self.fetch_corplist(page_url):
                self._msg('***************************************************')
                print(corp_info['name'], end=' ')
                info_from = check_exists.send(corp_info)
                if not info_from:
                    if self.corp_regs:
                        corp_info = self.fetch_corp(corp_info)
                    corp_info = self.before_save(corp_info)
                    self.queue.put(corp_info)
                    print('保存成功!')
                else:
                    print('已经存在于: %s' % info_from)
        self._msg('\n%s 抓取完毕!' % self.info_from)
        self.queue.put(None)

    def report(self, fields=None):
        corps = self.model.filter_by(info_from=self.info_from, insert_date=datetime.date.today())
        #corps = self.model.filter_by(info_from=self.info_from, insert_date=datetime.date(2011,12,8))
        fields = fields or (
            ('名称', 'name'),
            ('地址', 'addr'),
            ('联系人', 'contact_person'),
            ('区号', 'contact_tel_code'),
            ('电话号码', 'contact_tel_no'),
            ('邮箱', 'mail'),
            ('网址', 'website'),
            ('信息来源', 'info_from'),
            ('更新日期', 'insert_date'),
            ('链接', self.corp_url),
        )
        self.model.report('%s最新公司信息_%s.csv' % (self.info_from, time.strftime('%Y-%m-%d')), fields=fields, rows=corps, encoder='gbk')
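Corp is a multiprocessing producer: run() pushes corp-info dicts onto a queue and sends None when it finishes. A hypothetical driver showing that contract (the CorpModel calls mirror the ones used above):

def run_scrapers(scrapers):
    # One shared queue; each producer signals completion with None.
    queue = multiprocessing.Queue()
    for scraper in scrapers:
        scraper.set_queue(queue)
        scraper.start()
    finished = 0
    while finished < len(scrapers):
        corp_info = queue.get()
        if corp_info is None:
            finished += 1
            continue
        CorpModel.add(corp_info, is_commit=False)
    CorpModel.commit()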
class Register_corp:
    def __init__(self):
        logging.basicConfig(filename='log.txt', level=logging.INFO, format='%(asctime)s %(message)s', datefmt='%Y-%m-%d %I:%M:%S %p')
        self.model = CorpModel
        self.invalid_code_model = InvalidCodeModel
        self.max_code_model = MaxCodeModel
        self.info_from = '广东红盾网'
        self.charset = 'utf8'
        self.opener = Opener(has_cookie=True, encoding=self.charset)
        # registration-authority code; 6 digits
        self.org_code = '440101'
        # enterprise-nature number; 2 digits: 00-30 domestic-funded, 40-50 foreign-funded.
        self.nature_num = 0
        # serial number; 4 digits
        self.ord_num = 0
        self.query_url = 'http://www.gzaic.gov.cn/gsbm/FrmRegBaseopeninfo.aspx'
        self.corp_url = 'http://www.gzaic.gov.cn/gsbm/FrmRegBaseOpeninfoDetail.aspx?key={0}'
        self.search_string = '查看'  # "View" link text; its presence means the query returned a result
        self.reg = re.compile(r'lbSREGNO">(?P<register_code>[^<]*)</span></td>.*?lbSNAME">(?P<name>[^<]*)</span></td>.*?lbSDOM">(?P<addr>[^<]*)</span></td>.*?lbSMAN">(?P<representative>[^<]*)</span></td>.*?lbSENTCLASS">\s*(?P<nature>[^<]*)</span></td>.*?lbSREGCAP">(?P<capital>[^<]*)</span></td>.*?lbSSSZB">[^<]*</span></td>.*?lbSOPSCOPE">(?P<scope>[^<]*)</span></td>.*?LbSREGRDATE">(?P<establishment_data>[^<]*)</span></td>.*?lbSAPPRDATE">[^<]*</span></td>.*?lbSREGORG">(?P<register_department>[^<]*)</span></td>', re.S)
        self.event_regs = [
            re.compile(r'__VIEWSTATE" value="(?P<__VIEWSTATE>[^"]*)'),
            re.compile(r'__EVENTVALIDATION" value="(?P<__EVENTVALIDATION>[^"]*)'),
        ]
        self.key_reg = re.compile(r'key=(?P<key>[^\']*)')
        self.post_data = {
            '__EVENTTARGET': 'QueryButton',
            '__EVENTARGUMENT': '',
            '__VIEWSTATE': '',
            '__EVENTVALIDATION': '',
            'txtName': '',
            'txtReg': '',
        }

        self._save_times = 0
        self._today = get_cst()

        self.initial()

    # Initialization: fetch the query page once to prime the ASP.NET form state
    # (__VIEWSTATE / __EVENTVALIDATION) used by the later POSTs.
    def initial(self):
        self.fetch_content(self.query_url)

    def _msg(self, text):
        return '%s %s' % (time.asctime(), text)

    def _process(self, corp_dict):
        corp_dict.update({
            'insert_date': self._today,
            'info_from': self.info_from,
            'status': '登记成立',  # "registered and established"
            'period': '长期',  # "long-term"
        })
        if 'establishment_data' in corp_dict and corp_dict['establishment_data']:
            corp_dict['establishment_data'] = self.model._str2date(corp_dict['establishment_data'])
        return corp_dict

    def calc_code15(self, org_code, nature_num, ord_num):
        """ Compute the check digit for the 15-digit registration number and return the full number. """
        code14 = '%s%02d%06d' % (org_code, nature_num, ord_num)
        temp = reduce(lambda x, y: ((x + int(y)) % 10 or 10) * 2 % 11, code14, 10)
        return '%s%s' % (code14, (11 - temp) % 10)

    def update_events(self, content):
        for reg in self.event_regs:
            search_obj = reg.search(content)
            search_obj and self.post_data.update(search_obj.groupdict())

    def fetch_content(self, url, data=None, timeout=10, raw_string=False, update_events=True):
        content = self.opener.urlopen(url, data, timeout=timeout, times=0)
        update_events and self.update_events(content)
        return content

    def fetch_query_content(self, code):
        self.post_data.update({
            '__EVENTTARGET': 'QueryButton',
            'txtReg': code,
        })
        return self.fetch_content(self.query_url, self.post_data)

    def fetch_key_content(self):
        # Presumably the first result row's LinkButton in the GridView (an inference, not confirmed).
        self.post_data['__EVENTTARGET'] = 'GridView1$ctl02$LinkButton1'
        return self.fetch_content(self.query_url, self.post_data)

    def fetch_corp_key(self, code):
        content = self.fetch_query_content(code)
        if content.find(self.search_string) < 0:
            return None
        content = self.fetch_key_content()
        search_obj = self.key_reg.search(content)
        return search_obj and search_obj.groups()[0]

    def fetch_corp_info(self, code):
        print('###############################################################')
        print(self._msg('注册码: %s' % code))
        corp_key = self.fetch_corp_key(code)
        if corp_key is None:
            print('没找到相关信息.')
            return []
        print('key: %s' % corp_key)
        url = self.corp_url.format(corp_key)
        content = self.fetch_content(url, update_events=False)
        result = []
        search_obj = self.reg.search(content)
        if search_obj:
            result.append(search_obj.groupdict())
            print(search_obj.groupdict())
        if not result:
            logging.info('register code: %s' % code)
        return result

    def save(self, register_code):
        self._save_times = (self._save_times + 1) % 101
        self._save_times or self.model.commit()
        corps = self.fetch_corp_info(register_code)
        if not corps:
            return False
        for corp in corps:
            self.model.add(self._process(corp), is_commit=False)
            print('添加成功!')
        print('###############################################################')
        return True

    def action(self):
        invalid_times = 0
        while self.nature_num<=99:
            if self.save(self.calc_code15(self.org_code, self.nature_num, self.ord_num)):
                invalid_times = 0
            else:
                invalid_times += 1
            self.ord_num += 1
            if invalid_times >= 500:
                invalid_times = 0
                self.ord_num = 0
                if self.nature_num >=40:
                    self.nature_num = 0
                    self.org_code = str(int(self.org_code)+1)
                    if int(self.org_code)>=440200:
                        break
                else:
                    self.nature_num = 40
                logging.info('org_code: %s nature code: %s' % (self.org_code, self.nature_num))
        self.model.commit()

    def action_test(self):
        pass

    def action_new(self, invalid_times=10):
        for query_obj in self.max_code_model.get_all():
            org_code, nature_num, ord_num = query_obj.org_code, query_obj.nature_num, query_obj.ord_num+1
            times = 0
            while times<invalid_times:
                if self.save(self.calc_code15(org_code, nature_num, ord_num)):
                    times = 0
                    query_obj.ord_num = ord_num
                    # Guard against the query_obj.ord_num assignment above being left uncommitted.
                    self._save_times or self.model.commit()
                else:
                    times += 1
                ord_num += 1
            self.model.commit()

    def action_from_invalid_codes(self):
        for query_obj in self.invalid_code_model.get_all():
            if self.save(query_obj.register_code):
                query_obj.delete()
        self.model.commit()

    def action_from_file(self):
        f = open('others.txt')
        for line in f:
            code = line[:-1]
            self.save(code)
        f.close()
        self.model.commit()

    def report(self):
        corps = self.model.filter_by(status='登记成立', insert_date=datetime.date.today())
        #corps = self.model.filter_by(status='登记成立', insert_date=datetime.date(2011,12,8))
        rows = []
        fields = (
            ('名称', 'name'),
            ('注册码', 'register_code'),
            ('地址', 'addr'),
            ('经营范围', 'scope'),
            ('注册资金', 'capital'),
            ('成立日期', 'establishment_data'),
            ('企业性质', 'nature'),
            ('法人', 'representative'),
            ('企业状态', 'status'),
            ('期限', 'period'),
            ('登记单位', 'register_department'),
            ('信息来源', 'info_from'),
            ('更新日期', 'insert_date'),
        )
        self.model.report('广州红盾网最新注册公司信息_%s.csv' % time.strftime('%Y-%m-%d'), fields=fields, rows=corps, encoder='gbk')
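A hypothetical entry point for the class above: brute-force newly issued registration numbers, then export the day's CSV report:

if __name__ == '__main__':
    register = Register_corp()   # __init__ already primes the ASP.NET form state
    register.action_new()
    register.report()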