import re
import datetime


def get_cst():
    """Fetch China Standard Time from beijing-time.org; fall back to local time."""
    from opener import Opener
    reg = re.compile(
        r'nyear=(?P<year>\d+).*?nmonth=(?P<month>\d+).*?nday=(?P<day>\d+)'
        r'.*?nwday=(\d+).*?nhrs=(?P<hour>\d+).*?nmin=(?P<minute>\d+)'
        r'.*?nsec=(?P<second>\d+)', re.S)
    opener = Opener(encoding='utf8')
    content = opener.urlopen('http://www.beijing-time.org/time.asp', times=0)
    search_obj = reg.search(content)
    if search_obj:
        # groupdict() holds only the named groups, so the unnamed nwday is skipped.
        return datetime.datetime(**{key: int(value)
                                    for key, value in search_obj.groupdict().items()})
    return datetime.datetime.now()
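For reference, a self-contained sketch of the parsing step above, run against a hard-coded sample of the beijing-time.org payload (the sample string is an illustrative assumption; the live response may differ):

import re
import datetime

SAMPLE = 'nyear=2012;nmonth=3;nday=14;nwday=3;nhrs=9;nmin=26;nsec=53;'

reg = re.compile(
    r'nyear=(?P<year>\d+).*?nmonth=(?P<month>\d+).*?nday=(?P<day>\d+)'
    r'.*?nwday=(\d+).*?nhrs=(?P<hour>\d+).*?nmin=(?P<minute>\d+)'
    r'.*?nsec=(?P<second>\d+)', re.S)

match = reg.search(SAMPLE)
if match:
    # Only the named groups survive groupdict(), so they map 1:1 onto
    # datetime.datetime keyword arguments.
    cst = datetime.datetime(**{k: int(v) for k, v in match.groupdict().items()})
    print(cst)  # 2012-03-14 09:26:53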
def get_reader(username=None, password=None, use_cookie_file=False):
    reader = Reader(Opener())
    if 'cookies' in session:
        print "Loading cookies"
        reader.opener.load_cookies(session['cookies'])
    elif use_cookie_file:
        print "Loading cookies from file"
        with open(tmp_dir + "cookies.txt", "r") as text_file:
            cookies = text_file.read()
        print cookies
        reader.opener.load_cookies(cookies)
    elif username is None:
        print "Cannot login, no username provided"
        return (None, "No username provided")
    else:
        print "Logging in as ", username
        reader.init()
        if not os.path.exists(tmp_dir):
            os.mkdir(tmp_dir)
        with open(tmp_dir + "cookies.txt", "wb") as text_file:
            text_file.write(reader.opener.get_cookies())
        result = reader.login(username, password)
        if "The userID or password could not be validated" in result:
            print "Bad User ID or password"
            return (None, "Bad User ID or password")
        if "Concurrent Login Error" in result:
            print "User already logged in"
            return (None, "User already logged in")
    print "Logged in"
    return (reader, "")
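A hypothetical call site showing the (reader, error-message) contract that get_reader returns; the credentials are placeholders:

reader, error = get_reader(username='someuser', password='secret')  # placeholder credentials
if reader is None:
    # error is one of: "No username provided", "Bad User ID or password",
    # "User already logged in"
    print(error)
else:
    result = reader.search('GEORGE', 'BROWN')  # same call as in the script below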
from bs4 import BeautifulSoup
from opener import Opener
from reader import Reader
import os
import parser

# TODO - Detect "Your query returned more than 200 records"

first = "GEORGE"
last = "BROWN"

print os.environ['username'], os.environ['password']

reader = Reader(Opener())
reader.init()
result = reader.login(os.environ['username'], os.environ['password'])
if "The userID or password could not be validated" in result:
    print "Login failed"
    exit()

try:
    print "Searching"
    result = reader.search(first, last)
    cases = parser.parse_search(result)
    for case in cases:
        print "Collecting " + case['id']
        result = reader.case_summary(case['id'])
        parser.parse_case_summary(result, case)
        result = reader.case_charges()
class DistrictCourtOpener:
    url_root = 'https://eapps.courts.state.va.us/gdcourts/'

    def __init__(self):
        self.opener = Opener('district')
        self.use_driver = True

    def url(self, url):
        return DistrictCourtOpener.url_root + url

    def log_off(self):
        return None

    def open_driver(self):
        self.driver = webdriver.Chrome('./chromedriver')
        self.driver.implicitly_wait(3)
        self.driver_open = True

    def open_welcome_page(self):
        url = self.url('caseSearch.do?welcomePage=welcomePage')
        page = self.opener.open('https://google.com')
        page_content = page.read()
        page = self.opener.open(url)
        page_content = page.read()
        # See if we need to solve a captcha
        if 'By clicking Accept' in page_content:
            self.solve_captcha(url)
            page = self.opener.open(url)
            page_content = page.read()
            if 'By clicking Accept' in page_content:
                raise RuntimeError('CAPTCHA failed')
        return BeautifulSoup(page_content, 'html.parser')

    def solve_captcha(self, url):
        self.open_driver()
        self.driver.get(url)
        time.sleep(1)
        current_url = self.driver.current_url
        print current_url
        # Wait for a human to solve the challenge; the URL changes on success.
        while current_url == self.driver.current_url:
            time.sleep(1)
        '''
        log.info('Solving CAPTCHA')
        captcha_solver = deathbycaptcha.SocketClient(os.environ['DBC_USER'], \
            os.environ['DBC_PASSWORD'])
        self.driver.get(url)
        captcha = self.driver.find_element_by_id('recaptcha_challenge_image')
        image_src = captcha.get_attribute('src')
        image_filename = str(os.getpid()) + '_captcha.png'
        urllib.urlretrieve(image_src, image_filename)
        try:
            captcha_solution = captcha_solver.decode(image_filename, 60)
            #captcha_solution = {'captcha': 'manual', 'text': raw_input('Enter CAPTCHA:')}
            if captcha_solution:
                log.info('CAPTCHA SOLVED')
                print "CAPTCHA %s solved: %s" % (captcha_solution["captcha"], captcha_solution["text"])
                self.driver.find_element_by_name('recaptcha_response_field') \
                    .send_keys(captcha_solution["text"])
            os.remove(image_filename)
        except deathbycaptcha.AccessDeniedException:
            log.error('deathbycaptcha access denied')
            print 'deathbycaptcha access denied'
        time.sleep(1)
        self.driver.find_element_by_name('captchaVerificationForm') \
            .submit()
        '''
        # Copy the solved session cookie from the browser into the opener.
        cookie = self.driver.get_cookie('JSESSIONID')['value']
        self.opener.set_cookie('JSESSIONID', cookie)
        self.opener.save_cookies()
        self.driver.quit()

    def change_court(self, name, code):
        data = urllib.urlencode({
            'selectedCourtsName': name,
            'selectedCourtsFipCode': code,
            'sessionCourtsFipCode': ''
        })
        url = self.url('changeCourt.do')
        self.opener.open(url, data)

    def open_hearing_date_search(self, code, search_division):
        url = self.url('caseSearch.do')
        url += '?fromSidebar=true&searchLanding=searchLanding'
        url += '&searchType=hearingDate&searchDivision=' + search_division
        url += '&searchFipsCode=' + code
        url += '&curentFipsCode=' + code
        self.opener.open(url)

    def do_hearing_date_search(self, code, date, first_page):
        data = {
            'formAction': '',
            'curentFipsCode': code,
            'searchTerm': date,
            'searchHearingTime': '',
            'searchCourtroom': '',
            'lastName': '',
            'firstName': '',
            'middleName': '',
            'suffix': '',
            'searchHearingType': '',
            'searchUnitNumber': '',
            'searchFipsCode': code
        }
        if first_page:
            data['caseSearch'] = 'Search'
        else:
            data['caseInfoScrollForward'] = 'Next'
            data['unCheckedCases'] = ''
        data = urllib.urlencode(data)
        url = self.url('caseSearch.do')
        page = self.opener.open(url, data)
        content = ''
        for line in page:
            # Keep only the case-details anchors, normalizing self-closing tags.
            if '<a href="caseSearch.do?formAction=caseDetails' in line:
                line = line.replace('/>', '>')
                content += line
        soup = BeautifulSoup(content, 'html.parser')
        return soup

    def open_case_number_search(self, code, search_division):
        url = self.url('criminalCivilCaseSearch.do')
        url += '?fromSidebar=true&formAction=searchLanding&searchDivision=' + search_division
        url += '&searchFipsCode=' + code
        url += '&curentFipsCode=' + code
        self.opener.open(url)

    def do_case_number_search(self, code, case_number, search_division):
        data = {
            'formAction': 'submitCase',
            'searchFipsCode': code,
            'searchDivision': search_division,
            'searchType': 'caseNumber',
            'displayCaseNumber': case_number,
            'localFipsCode': code,
            'clientSearchCounter': 0
        }
        data = urllib.urlencode(data)
        url = self.url('criminalCivilCaseSearch.do')
        self.opener.open(url, data)
        # the post returns 302, then we have to do a GET... strange
        url = self.url('criminalDetail.do')
        content = self.opener.open(url)
        soup = BeautifulSoup(content, 'html.parser')
        return soup

    def open_case_details(self, details_url):
        url = self.url(details_url)
        page = self.opener.open(url)
        return BeautifulSoup(page.read(), 'html.parser')

    def open_name_search(self, code, search_division):
        url = self.url('nameSearch.do')
        url += '?fromSidebar=true&formAction=searchLanding&searchDivision=' + search_division
        url += '&searchFipsCode=' + code
        url += '&curentFipsCode=' + code
        if self.use_driver:
            self.driver.get(url)
        else:
            self.opener.open(url)

    def do_name_search_with_driver(self, code, name, count, prev_cases):
        if prev_cases:
            xpath = "//input[@value='Next'][@type='submit']"
            self.driver.find_element_by_xpath(xpath).click()
        else:
            self.driver.find_element_by_name('localnamesearchlastName') \
                .send_keys(name)
            xpath = "//input[@value='Search'][@type='submit']"
            self.driver.find_element_by_xpath(xpath).click()
        time.sleep(1)
        source = self.driver.page_source
        soup = BeautifulSoup(source, 'html.parser')
        return soup

    def do_name_search(self, code, search_division, name, count, prev_cases=None):
        if self.use_driver:
            return self.do_name_search_with_driver(code, name, count, prev_cases)
        data = {
            'formAction': 'newSearch',
            'displayCaseNumber': '',
            'formBean': '',
            'localFipsCode': code,
            'caseActive': '',
            'localLastName': '',
            'forward': '',
            'back': '',
            'localnamesearchlastName': name,
            'lastName': name,
            'localnamesearchfirstName': '',
            'firstName': '',
            'localnamesearchmiddleName': '',
            'middleName': '',
            'localnamesearchsuffix': '',
            'suffix': '',
            'localnamesearchsearchCategory': 'A',
            'searchCategory': 'A',
            'searchFipsCode': code,
            'searchDivision': search_division,
            'searchType': 'name',
            'firstRowName': '',
            'firstRowCaseNumber': '',
            'lastRowName': '',
            'lastRowCaseNumber': '',
            'clientSearchCounter': count
        }
        if prev_cases:
            data['formAction'] = 'next'
            data['unCheckedCases'] = ''
            data['firstRowName'] = prev_cases[0]['defendant']
            data['firstRowCaseNumber'] = prev_cases[0]['case_number']
            data['lastRowName'] = prev_cases[-1]['defendant']
            data['lastRowCaseNumber'] = prev_cases[-1]['case_number']
        data = urllib.urlencode(data)
        url = self.url('nameSearch.do')
        content = self.opener.open(url, data)
        soup = BeautifulSoup(content, 'html.parser')
        return soup
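A hedged sketch of paging through hearing-date results with the class above; the court name, FIPS code, division letter, and the empty-page stop condition are assumptions for illustration, not values confirmed by this code:

opener = DistrictCourtOpener()
opener.open_welcome_page()
opener.change_court('Fairfax General District Court', '059')  # placeholder court/FIPS
opener.open_hearing_date_search('059', 'V')                   # placeholder division

first_page = True
while True:
    soup = opener.do_hearing_date_search('059', '01/15/2017', first_page)
    links = soup.find_all('a')   # each kept line is a case-details anchor
    if not links:
        break                    # assumed stop condition: no more result rows
    first_page = False           # later requests post caseInfoScrollForward='Next'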
class CircuitCourtOpener:
    url_root = 'http://ewsocis1.courts.state.va.us/CJISWeb/'

    def __init__(self):
        self.opener = Opener('circuit')

    def url(self, url):
        return CircuitCourtOpener.url_root + url

    def open_welcome_page(self):
        url = self.url('circuit.jsp')
        page = self.opener.open(url)
        return BeautifulSoup(page.read(), 'html.parser')

    def log_off(self):
        data = urllib.urlencode({'searchType': ''})
        url = self.url('Logoff.do')
        self.opener.open(url, data)

    def change_court(self, code, court):
        data = urllib.urlencode({
            'courtId': code,
            'courtType': 'C',
            'caseType': 'ALL',
            'testdos': False,
            'sessionCreate': 'NEW',
            'whichsystem': court
        })
        url = self.url('MainMenu.do')
        self.opener.open(url, data)

    def do_case_number_search(self, code, case_number, category):
        data = {
            'courtId': code,
            'caseNo': case_number,
            'categorySelected': category
        }
        data = urllib.urlencode(data)
        url = self.url('CaseDetail.do')
        page = self.opener.open(url, data)
        return BeautifulSoup(page.read(), 'html.parser')

    def do_name_search(self, code, name, category):
        data = {
            'category': category,
            'lastName': name,
            'courtId': code,
            'submitValue': 'N'
        }
        data = urllib.urlencode(data)
        url = self.url('Search.do')
        page = self.opener.open(url, data)
        return BeautifulSoup(page.read(), 'html.parser')

    def continue_name_search(self, code, category):
        data = {
            'courtId': code,
            'pagelink': 'Next',
            'lastCaseProcessed': '',
            'firstCaseProcessed': '',
            'lastNameProcessed': '',
            'firstNameProcessed': '',
            'category': category,
            'firstCaseSerialNumber': 0,
            'lastCaseSerialNumber': 0,
            'searchType': '',
            'emptyList': ''
        }
        data = urllib.urlencode(data)
        url = self.url('Search.do')
        page = self.opener.open(url, data)
        return BeautifulSoup(page.read(), 'html.parser')

    def do_date_search(self, code, date, category):
        data = {
            'hearSelect': '',
            'selectDate': date,
            'categorySelected': category,
            'hearDateSelected': date,
            'submitValue': '',
            'courtId': code
        }
        data = urllib.urlencode(data)
        url = self.url('hearSearch.do')
        page = self.opener.open(url, data)
        return BeautifulSoup(page.read(), 'html.parser')

    def continue_date_search(self, code, category):
        data = {
            'courtId': code,
            'pagelink': 'Next',
            'lastCaseProcessed': '',
            'firstCaseProcessed': '',
            'category': category,
            'firstCaseSerialNumber': 0,
            'lastCaseSerialNumber': 0,
            'searchType': '',
            'emptyList': ''
        }
        data = urllib.urlencode(data)
        url = self.url('hearSearch.do')
        page = self.opener.open(url, data)
        return BeautifulSoup(page.read(), 'html.parser')
class DistrictCourtOpener:
    url_root = 'https://eapps.courts.state.va.us/gdcourts/'

    def __init__(self):
        self.opener = Opener()

    def url(self, url):
        return DistrictCourtOpener.url_root + url

    def open_welcome_page(self):
        url = self.url('caseSearch.do?welcomePage=welcomePage')
        page = self.opener.open(url)
        page_content = page.read()
        soup = BeautifulSoup(page_content)
        # See if we need to solve a captcha
        if 'By clicking Accept' in page_content:
            captcha_url = self.url('captchaVerification.do')
            captcha.solve(self.opener, captcha_url)
            page = self.opener.open(url)
            soup = BeautifulSoup(page.read())
        return soup

    def change_court(self, name, code):
        data = urllib.urlencode({
            'selectedCourtsName': name,
            'selectedCourtsFipCode': code,
            'sessionCourtsFipCode': ''
        })
        url = self.url('changeCourt.do')
        self.opener.open(url, data)

    def open_hearing_date_search(self, code):
        url = self.url('caseSearch.do')
        url += '?fromSidebar=true&searchLanding=searchLanding'
        url += '&searchType=hearingDate&searchDivision=T'
        url += '&searchFipsCode=' + code
        url += '&curentFipsCode=' + code
        self.opener.open(url)

    def do_hearing_date_search(self, code, date, first_page):
        data = {
            'formAction': '',
            'curentFipsCode': code,
            'searchTerm': date,
            'searchHearingTime': '',
            'searchCourtroom': '',
            'lastName': '',
            'firstName': '',
            'middleName': '',
            'suffix': '',
            'searchHearingType': '',
            'searchUnitNumber': '',
            'searchFipsCode': code
        }
        if first_page:
            data['caseSearch'] = 'Search'
        else:
            data['caseInfoScrollForward'] = 'Next'
            data['unCheckedCases'] = ''
        data = urllib.urlencode(data)
        url = self.url('caseSearch.do')
        page = self.opener.open(url, data)
        content = ''
        for line in page:
            if '<a href="caseSearch.do?formAction=caseDetails' in line:
                line = line.replace('/>', '>')
                content += line
        soup = BeautifulSoup(content)
        return soup

    def open_case_details(self, case):
        url = self.url(case['details_url'])
        page = self.opener.open(url)
        return BeautifulSoup(page.read())
class CircuitCourtOpener:
    url_root = 'http://ewsocis1.courts.state.va.us/CJISWeb/'

    def __init__(self):
        self.opener = Opener('circuit')

    def url(self, url):
        return CircuitCourtOpener.url_root + url

    def open_welcome_page(self):
        url = self.url('circuit.jsp')
        page = self.opener.open(url)
        return BeautifulSoup(page.read(), 'html.parser')

    def log_off(self):
        data = urllib.urlencode({'searchType': ''})
        url = self.url('Logoff.do')
        self.opener.open(url, data)

    def change_court(self, code, court):
        data = urllib.urlencode({
            'courtId': code,
            'courtType': 'C',
            'caseType': 'ALL',
            'testdos': False,
            'sessionCreate': 'NEW',
            'whichsystem': court
        })
        url = self.url('MainMenu.do')
        self.opener.open(url, data)

    def do_case_number_search(self, code, case_number, category):
        data = {
            'submitValue': '',
            'courtId': code,
            'caseNo': case_number,
            'categorySelected': category
        }
        data = urllib.urlencode(data)
        url = self.url('CaseDetail.do')
        page = self.opener.open(url, data)
        return BeautifulSoup(page.read(), 'html.parser')

    def do_case_number_pleadings_search(self, code, case_number, category):
        data = {
            'submitValue': 'P',
            'courtId': code,
            'categorySelected': category,
            'caseStatus': 'A',
            'caseNo': case_number
        }
        data = urllib.urlencode(data)
        url = self.url('CaseDetail.do')
        page = self.opener.open(url, data)
        return BeautifulSoup(page.read(), 'html.parser')

    def do_case_number_services_search(self, code, case_number, category):
        data = {
            'submitValue': 'S',
            'courtId': code,
            'categorySelected': category,
            'caseStatus': 'A',
            'caseNo': case_number
        }
        data = urllib.urlencode(data)
        url = self.url('CaseDetail.do')
        page = self.opener.open(url, data)
        return BeautifulSoup(page.read(), 'html.parser')

    def return_to_main_menu(self, code):
        data = {'courtId': code}
        data = urllib.urlencode(data)
        url = self.url('MainMenu.do')
        self.opener.open(url, data)
        return

    def do_name_search(self, code, name, category):
        data = {
            'category': category,
            'lastName': name,
            'courtId': code,
            'submitValue': 'N'
        }
        data = urllib.urlencode(data)
        url = self.url('Search.do')
        page = self.opener.open(url, data)
        return BeautifulSoup(page.read(), 'html.parser')

    def continue_name_search(self, code, category):
        data = {
            'courtId': code,
            'pagelink': 'Next',
            'lastCaseProcessed': '',
            'firstCaseProcessed': '',
            'lastNameProcessed': '',
            'firstNameProcessed': '',
            'category': category,
            'firstCaseSerialNumber': 0,
            'lastCaseSerialNumber': 0,
            'searchType': '',
            'emptyList': ''
        }
        data = urllib.urlencode(data)
        url = self.url('Search.do')
        page = self.opener.open(url, data)
        return BeautifulSoup(page.read(), 'html.parser')

    def do_date_search(self, code, date, category):
        data = {
            'hearSelect': '',
            'selectDate': date,
            'categorySelected': category,
            'hearDateSelected': date,
            'submitValue': '',
            'courtId': code
        }
        data = urllib.urlencode(data)
        url = self.url('hearSearch.do')
        page = self.opener.open(url, data)
        return BeautifulSoup(page.read(), 'html.parser')

    def continue_date_search(self, code, category):
        data = {
            'courtId': code,
            'pagelink': 'Next',
            'lastCaseProcessed': '',
            'firstCaseProcessed': '',
            'category': category,
            'firstCaseSerialNumber': 0,
            'lastCaseSerialNumber': 0,
            'searchType': '',
            'emptyList': ''
        }
        data = urllib.urlencode(data)
        url = self.url('hearSearch.do')
        page = self.opener.open(url, data)
        return BeautifulSoup(page.read(), 'html.parser')
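A similar usage sketch for the circuit-court opener's name search; the court code, court name, and category are placeholders, and the 'Next' button test is an assumption about the results page:

opener = CircuitCourtOpener()
opener.open_welcome_page()
opener.change_court('710', 'NEWPORT NEWS CIRCUIT')  # placeholder code/court
soup = opener.do_name_search('710', 'SMITH', 'CIVIL')
while soup.find('input', {'value': 'Next'}):        # assumed pagination marker
    soup = opener.continue_name_search('710', 'CIVIL')
opener.log_off()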
class Register_corp:

    def __init__(self):
        logging.basicConfig(filename='log.txt', level=logging.INFO,
                            format='%(asctime)s %(message)s',
                            datefmt='%Y-%m-%d %I:%M:%S %p')
        self.model = CorpModel
        self.invalid_code_model = InvalidCodeModel
        self.max_code_model = MaxCodeModel
        self.info_from = '广东红盾网'
        self.charset = 'gbk'
        self.opener = Opener(encoding=self.charset)
        # Registration-authority code (6 digits)
        self.org_code = '441900'
        # Enterprise-nature code (2 digits): 00-30 domestic-funded, 40-50 foreign-funded
        self.nature_num = 0
        # Serial number (4 digits)
        self.ord_num = 1
        self.corp_url = 'http://wsnj.gdgs.gov.cn/aiccps/SearchServlet?service=getEntityInfoByPage&registerNo=%s'
        self.search_text_reg = re.compile(r'^<table width="100%" border="0"><tr><td align=center height=200 >找不到相关的数据\.\.</td></tr></table>$')
        self.regs = [
            re.compile(r'<td align=left width=100% colspan=6 height=25>(?P<name>[^<]+)', re.S),
            re.compile(r'址:</td><td align=left valign=top colspan=5>(?P<addr>[^<]*)', re.S),
            re.compile(r'号:</td><td align=left valign=top><font color="red">(?P<register_code>[^<]*)', re.S),
            re.compile(r'[名人]:</td><td align=left valign=top colspan=3>(?P<representative>[^<]*)', re.S),
            re.compile(r'型:</td><td align=left valign=top>(?P<nature>[^<]*)', re.S),
            re.compile(r'限:</td><td align=left valign=top colspan=3>(?P<period>[^<]*)', re.S),
            re.compile(r'本:</td><td align=left valign=top>(?P<capital>[^<]*)', re.S),
            re.compile(r'关:</td><td align=left valign=top colspan=3>(?P<register_department>[^<]*)', re.S),
            re.compile(r'态:</td><td align=left valign=top>(?P<status>[^<]*)', re.S),
            re.compile(r'期:</td><td align=left valign=top colspan=3>(?P<establishment_data>[^<]+)', re.S),
            re.compile(r'围:</td><td align=left valign=top colspan=5>(?P<scope>[^<]*)', re.S),
        ]
        self._save_times = 0
        self._today = get_cst()

    def _msg(self, text):
        return '%s %s' % (time.asctime(), text)

    def _process(self, corp_dict):
        corp_dict['insert_date'] = self._today
        corp_dict['info_from'] = self.info_from
        if 'establishment_data' in corp_dict and corp_dict['establishment_data']:
            corp_dict['establishment_data'] = self.model._str2date(corp_dict['establishment_data'])
        return corp_dict

    def calc_code15(self, org_code, nature_num, ord_num):
        """Compute the check digit and return the full 15-digit registration number."""
        code14 = '%s%02d%06d' % (org_code, nature_num, ord_num)
        temp = reduce(lambda x, y: ((x + int(y)) % 10 or 10) * 2 % 11, code14, 10)
        return '%s%s' % (code14, (11 - temp) % 10)

    def calc_code13(self, org_code, nature_num, ord_num):
        """Return the 13-digit registration number."""
        return '%s%s%04d' % (org_code, nature_num, ord_num)

    def init_invalid_codes(self):
        i = 0
        for query_obj in self.max_code_model.get_all():
            org_code, nature_num, max_ord_num = query_obj.org_code, query_obj.nature_num, query_obj.ord_num
            for ord_num in range(0, max_ord_num):
                register_code = self.calc_code15(org_code, nature_num, ord_num)
                if self.model.exists_by(register_code=register_code):
                    continue
                self.invalid_code_model.add({'register_code': register_code}, is_commit=False)
                i += 1
                if not i % 200:
                    self.model.commit()
                    print('Save 200 invalid codes!')
        self.model.commit()

    def fetch(self, code):
        print('###############################################################')
        print(self._msg('注册码: %s' % code))
        url = self.corp_url % code
        content = self.opener.urlopen(url, timeout=10, times=0)
        #if content.find(self.search_text) < 0:
        if self.search_text_reg.match(content):
            print('没找到相关信息.')  # no matching record
            return []
        result = []
        temp = {}
        for search_obj in (reg.search(content) for reg in self.regs):
            if search_obj:
                temp.update(search_obj.groupdict())
        print(temp)
        result.append(temp)
        if not result:
            logging.info('register code: %s' % code)
        return result

    def save(self, register_code):
        self._save_times = (self._save_times + 1) % 101
        self._save_times or self.model.commit()
        corps = self.fetch(register_code)
        if not corps:
            return False
        for corp in corps:
            self.model.add(self._process(corp), is_commit=False)
            print('添加成功!')  # saved successfully
        print('###############################################################')
        return True

    def action(self):
        invalid_times = 0
        while self.nature_num <= 99:
            if self.save(self.calc_code15(self.org_code, self.nature_num, self.ord_num)):
                invalid_times = 0
            else:
                invalid_times += 1
            self.ord_num += 1
            if invalid_times >= 500:
                invalid_times = 0
                self.ord_num = 0
                if self.nature_num >= 40:
                    self.nature_num = 0
                    self.org_code = str(int(self.org_code) + 1)
                    if int(self.org_code) >= 440200:
                        break
                else:
                    self.nature_num = 40
                logging.info('org_code: %s nature code: %s' % (self.org_code, self.nature_num))
        self.model.commit()

    def action_test(self):
        pass

    def action_new(self, invalid_times=20):
        for query_obj in self.max_code_model.get_all():
            org_code, nature_num, ord_num = query_obj.org_code, query_obj.nature_num, query_obj.ord_num + 1
            times = 0
            while times < invalid_times:
                if self.save(self.calc_code15(org_code, nature_num, ord_num)):
                    times = 0
                    query_obj.ord_num = ord_num
                    # Guard against the ord_num update above being left uncommitted.
                    self._save_times or self.model.commit()
                else:
                    times += 1
                ord_num += 1
        self.model.commit()

    def action_from_invalid_codes(self):
        for query_obj in self.invalid_code_model.get_all():
            if self.save(query_obj.register_code):
                query_obj.delete()
        self.model.commit()

    def action_from_file(self):
        f = open('others.txt')
        for line in f:
            code = line[:-1]
            self.save(code)
        f.close()
        self.model.commit()

    def report(self):
        corps = self.model.filter_by(status='登记成立', insert_date=datetime.date.today())
        #corps = self.model.filter_by(status='登记成立', insert_date=datetime.date(2011,11,3))
        rows = []
        fields = (
            ('名称', 'name'),
            ('注册码', 'register_code'),
            ('地址', 'addr'),
            ('经营范围', 'scope'),
            ('注册资金', 'capital'),
            ('成立日期', 'establishment_data'),
            ('企业性质', 'nature'),
            ('法人', 'representative'),
            ('企业状态', 'status'),
            ('期限', 'period'),
            ('登记单位', 'register_department'),
            ('信息来源', 'info_from'),
            ('更新日期', 'insert_date'),
        )
        self.model.report('东莞红盾网最新注册公司信息_%s.csv' % time.strftime('%Y-%m-%d'),
                          fields=fields, rows=corps, encoder='gbk')
class Corp(multiprocessing.Process):

    def __init__(self, corplist_url, corp_url, info_from, corplist_post_data=None,
                 corp_post_data=None, corplist_reg=None, corp_regs=[], timeout=5,
                 commit_each_times=30, has_cookie=True, charset='utf8', model=None):
        """corplist_url and corp_url are str.format-style templates,
        using placeholders such as {0} and {1}."""
        super().__init__()
        self.charset = charset
        self.info_from = info_from
        self.corplist_url = corplist_url
        self.corp_url = corp_url
        self.opener = Opener(has_cookie=has_cookie, encoding=self.charset)
        self.corplist_post_data = corplist_post_data
        self.corp_post_data = corp_post_data
        self.corplist_reg = corplist_reg
        self.corp_regs = corp_regs
        self.commit_each_times = commit_each_times
        self.timeout = timeout
        if model:
            self.model = model
        else:
            from lib.models import CorpModel
            self.model = CorpModel
        #self._today = get_cst()
        self._today = datetime.date.today()

    def _msg(self, msg=''):
        #print('%s %s' % (time.strftime('%Y-%m-%d %H:%M:%S'), msg))
        logging.info(msg)

    def set_queue(self, queue):
        self.queue = queue

    def process_corp_info(self, corp_info,
                          date_reg=r'(?P<year>\d+)-(?P<month>\d+)-(?P<day>\d+)'):
        for key, values in corp_info.items():
            corp_info[key] = values.strip()
        if 'insert_date' in corp_info:
            corp_info['insert_date'] = self.model._str2date(corp_info['insert_date'], date_reg=date_reg)
        else:
            corp_info['insert_date'] = self._today
        return corp_info

    def get_next_page_url(self):
        """Must be a plain (non-coroutine) generator, or return an iterable."""
        # One-element tuple: a bare string would be iterated character by character.
        return (self.corplist_url,)

    def get_corp_url(self, corp_info={}):
        return self.corp_url.format(**corp_info)

    def prepare(self):
        pass

    def fetch_corplist(self, page_url):
        """On success, return a list (or other iterable) of corp-info dicts;
        otherwise {}."""
        content = self.opener.urlopen(page_url, data=self.corplist_post_data,
                                      timeout=self.timeout, times=0)
        return ({} if not search_obj else search_obj.groupdict()
                for search_obj in self.corplist_reg.finditer(content))

    def fetch_corp(self, corp_info=None):
        """On success, return a single corp-info dict."""
        corp_url = self.get_corp_url(corp_info)
        content = self.opener.urlopen(corp_url, data=self.corp_post_data,
                                      timeout=self.timeout, times=0)
        for reg in self.corp_regs:
            search_obj = reg.search(content)
            search_obj and corp_info.update(search_obj.groupdict())
        return corp_info

    def before_save(self, corp_info):
        corp_info = self.process_corp_info(corp_info)
        corp_info['info_from'] = self.info_from
        return corp_info

    def commit(self):
        self.model.commit()

    @coroutine
    def check_exists(self):
        """Coroutine: for each corp_info sent in, yield the info_from of an
        existing record, or None if the corp is not stored yet."""
        corp_names_cache = {}
        corp_names_cache_list = []
        cache_length = 0
        result = None
        while 1:
            corp_info = (yield result)
            result = None
            corp_name = corp_info['name'].strip()
            if corp_name not in corp_names_cache:
                corp_names_cache[corp_name] = self.info_from
                corp_names_cache_list.insert(0, corp_name)
                cache_length += 1
                # Evict the oldest entry once the cache outgrows its bound.
                if cache_length > self.commit_each_times:
                    del corp_names_cache[corp_names_cache_list.pop()]
                    cache_length -= 1
                exists_corp = self.model.filter_by(name=corp_name).first()
                if exists_corp:
                    result = exists_corp.info_from
                    corp_names_cache[corp_name] = result
            else:
                result = corp_names_cache[corp_name]

    def run(self):
        self.prepare()
        check_exists = self.check_exists()
        cur_page = itertools.count()
        for page_url in self.get_next_page_url():
            print('\n%s 第%s页' % (self.info_from, next(cur_page) + 1))  # page counter
            for corp_info in self.fetch_corplist(page_url):
                self._msg('***************************************************')
                print(corp_info['name'], end=' ')
                info_from = check_exists.send(corp_info)
                if not info_from:
                    if self.corp_regs:
                        corp_info = self.fetch_corp(corp_info)
                    corp_info = self.before_save(corp_info)
                    self.queue.put(corp_info)
                    print('保存成功!')  # saved
                else:
                    print('已经存在于: %s' % info_from)  # already stored by this source
        self._msg('\n%s 抓取完毕!' % self.info_from)  # crawl finished
        self.queue.put(None)

    def report(self, fields=None):
        corps = self.model.filter_by(info_from=self.info_from,
                                     insert_date=datetime.date.today())
        #corps = self.model.filter_by(info_from=self.info_from, insert_date=datetime.date(2011,12,8))
        fields = fields or (
            ('名称', 'name'),
            ('地址', 'addr'),
            ('联系人', 'contact_person'),
            ('区号', 'contact_tel_code'),
            ('电话号码', 'contact_tel_no'),
            ('邮箱', 'mail'),
            ('网址', 'website'),
            ('信息来源', 'info_from'),
            ('更新日期', 'insert_date'),
            ('链接', self.corp_url),
        )
        self.model.report('%s最新公司信息_%s.csv' % (self.info_from, time.strftime('%Y-%m-%d')),
                          fields=fields, rows=corps, encoder='gbk')
class Register_corp:

    def __init__(self):
        logging.basicConfig(filename='log.txt', level=logging.INFO,
                            format='%(asctime)s %(message)s',
                            datefmt='%Y-%m-%d %I:%M:%S %p')
        self.model = CorpModel
        self.invalid_code_model = InvalidCodeModel
        self.max_code_model = MaxCodeModel
        self.info_from = '广东红盾网'
        self.charset = 'utf8'
        self.opener = Opener(has_cookie=True, encoding=self.charset)
        # Registration-authority code (6 digits)
        self.org_code = '440101'
        # Enterprise-nature code (2 digits): 00-30 domestic-funded, 40-50 foreign-funded
        self.nature_num = 0
        # Serial number (4 digits)
        self.ord_num = 0
        self.query_url = 'http://www.gzaic.gov.cn/gsbm/FrmRegBaseopeninfo.aspx'
        self.corp_url = 'http://www.gzaic.gov.cn/gsbm/FrmRegBaseOpeninfoDetail.aspx?key={0}'
        self.search_string = '查看'  # link text meaning "view"
        self.reg = re.compile(
            r'lbSREGNO">(?P<register_code>[^<]*)</span></td>.*?'
            r'lbSNAME">(?P<name>[^<]*)</span></td>.*?'
            r'lbSDOM">(?P<addr>[^<]*)</span></td>.*?'
            r'lbSMAN">(?P<representative>[^<]*)</span></td>.*?'
            r'lbSENTCLASS">\s*(?P<nature>[^<]*)</span></td>.*?'
            r'lbSREGCAP">(?P<capital>[^<]*)</span></td>.*?'
            r'lbSSSZB">[^<]*</span></td>.*?'
            r'lbSOPSCOPE">(?P<scope>[^<]*)</span></td>.*?'
            r'LbSREGRDATE">(?P<establishment_data>[^<]*)</span></td>.*?'
            r'lbSAPPRDATE">[^<]*</span></td>.*?'
            r'lbSREGORG">(?P<register_department>[^<]*)</span></td>', re.S)
        self.event_regs = [
            re.compile(r'__VIEWSTATE" value="(?P<__VIEWSTATE>[^"]*)'),
            re.compile(r'__EVENTVALIDATION" value="(?P<__EVENTVALIDATION>[^"]*)'),
        ]
        self.key_reg = re.compile(r'key=(?P<key>[^\']*)')
        self.post_data = {
            '__EVENTTARGET': 'QueryButton',
            '__EVENTARGUMENT': '',
            '__VIEWSTATE': '',
            '__EVENTVALIDATION': '',
            'txtName': '',
            'txtReg': '',
        }
        self._save_times = 0
        self._today = get_cst()
        self.initial()

    def initial(self):
        # Initialization: fetch the query page once to prime __VIEWSTATE/__EVENTVALIDATION.
        self.fetch_content(self.query_url)

    def _msg(self, text):
        return '%s %s' % (time.asctime(), text)

    def _process(self, corp_dict):
        corp_dict.update({
            'insert_date': self._today,
            'info_from': self.info_from,
            'status': '登记成立',
            'period': '长期',
        })
        if 'establishment_data' in corp_dict and corp_dict['establishment_data']:
            corp_dict['establishment_data'] = self.model._str2date(corp_dict['establishment_data'])
        return corp_dict

    def calc_code15(self, org_code, nature_num, ord_num):
        """Compute the check digit and return the full 15-digit registration number."""
        code14 = '%s%02d%06d' % (org_code, nature_num, ord_num)
        temp = reduce(lambda x, y: ((x + int(y)) % 10 or 10) * 2 % 11, code14, 10)
        return '%s%s' % (code14, (11 - temp) % 10)

    def update_events(self, content):
        for reg in self.event_regs:
            search_obj = reg.search(content)
            search_obj and self.post_data.update(search_obj.groupdict())

    def fetch_content(self, url, data=None, timeout=10, raw_string=False, update_events=True):
        content = self.opener.urlopen(url, data, timeout=timeout, times=0)
        update_events and self.update_events(content)
        return content

    def fetch_query_content(self, code):
        self.post_data.update({
            '__EVENTTARGET': 'QueryButton',
            'txtReg': code,
        })
        return self.fetch_content(self.query_url, self.post_data)

    def fetch_key_content(self):
        self.post_data['__EVENTTARGET'] = 'GridView1$ctl02$LinkButton1'
        return self.fetch_content(self.query_url, self.post_data)

    def fetch_corp_key(self, code):
        content = self.fetch_query_content(code)
        if content.find(self.search_string) < 0:
            return None
        content = self.fetch_key_content()
        search_obj = self.key_reg.search(content)
        return search_obj and search_obj.groups()[0]

    def fetch_corp_info(self, code):
        print('###############################################################')
        print(self._msg('注册码: %s' % code))
        corp_key = self.fetch_corp_key(code)
        if corp_key is None:
            print('没找到相关信息.')  # no matching record
            return []
        print('key: %s' % corp_key)
        url = self.corp_url.format(corp_key)
        content = self.fetch_content(url, update_events=False)
        result = []
        search_obj = self.reg.search(content)
        if search_obj:
            search_obj.groupdict() and result.append(search_obj.groupdict())
            print(search_obj.groupdict())
        if not result:
            logging.info('register code: %s' % code)
        return result

    def save(self, register_code):
        self._save_times = (self._save_times + 1) % 101
        self._save_times or self.model.commit()
        corps = self.fetch_corp_info(register_code)
        if not corps:
            return False
        for corp in corps:
            self.model.add(self._process(corp), is_commit=False)
            print('添加成功!')  # saved successfully
        print('###############################################################')
        return True

    def action(self):
        invalid_times = 0
        while self.nature_num <= 99:
            if self.save(self.calc_code15(self.org_code, self.nature_num, self.ord_num)):
                invalid_times = 0
            else:
                invalid_times += 1
            self.ord_num += 1
            if invalid_times >= 500:
                invalid_times = 0
                self.ord_num = 0
                if self.nature_num >= 40:
                    self.nature_num = 0
                    self.org_code = str(int(self.org_code) + 1)
                    if int(self.org_code) >= 440200:
                        break
                else:
                    self.nature_num = 40
                logging.info('org_code: %s nature code: %s' % (self.org_code, self.nature_num))
        self.model.commit()

    def action_test(self):
        pass

    def action_new(self, invalid_times=10):
        for query_obj in self.max_code_model.get_all():
            org_code, nature_num, ord_num = query_obj.org_code, query_obj.nature_num, query_obj.ord_num + 1
            times = 0
            while times < invalid_times:
                if self.save(self.calc_code15(org_code, nature_num, ord_num)):
                    times = 0
                    query_obj.ord_num = ord_num
                    # Guard against the ord_num update above being left uncommitted.
                    self._save_times or self.model.commit()
                else:
                    times += 1
                ord_num += 1
        self.model.commit()

    def action_from_invalid_codes(self):
        for query_obj in self.invalid_code_model.get_all():
            if self.save(query_obj.register_code):
                query_obj.delete()
        self.model.commit()

    def action_from_file(self):
        f = open('others.txt')
        for line in f:
            code = line[:-1]
            self.save(code)
        f.close()
        self.model.commit()

    def report(self):
        corps = self.model.filter_by(status='登记成立', insert_date=datetime.date.today())
        #corps = self.model.filter_by(status='登记成立', insert_date=datetime.date(2011,12,8))
        rows = []
        fields = (
            ('名称', 'name'),
            ('注册码', 'register_code'),
            ('地址', 'addr'),
            ('经营范围', 'scope'),
            ('注册资金', 'capital'),
            ('成立日期', 'establishment_data'),
            ('企业性质', 'nature'),
            ('法人', 'representative'),
            ('企业状态', 'status'),
            ('期限', 'period'),
            ('登记单位', 'register_department'),
            ('信息来源', 'info_from'),
            ('更新日期', 'insert_date'),
        )
        self.model.report('广州红盾网最新注册公司信息_%s.csv' % time.strftime('%Y-%m-%d'),
                          fields=fields, rows=corps, encoder='gbk')
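The FrmRegBaseopeninfo.aspx flow above is the standard ASP.NET WebForms postback dance: harvest the hidden __VIEWSTATE and __EVENTVALIDATION fields from each response and echo them in the next POST. A reduced sketch of just that round-trip, using the same regexes as update_events (the sample string is an illustrative assumption):

import re

EVENT_REGS = [
    re.compile(r'__VIEWSTATE" value="(?P<__VIEWSTATE>[^"]*)'),
    re.compile(r'__EVENTVALIDATION" value="(?P<__EVENTVALIDATION>[^"]*)'),
]


def update_events(post_data, content):
    # Pull the hidden-state fields out of the page and fold them into the
    # data for the next POST, as update_events() does above.
    for reg in EVENT_REGS:
        match = reg.search(content)
        if match:
            post_data.update(match.groupdict())
    return post_data


sample = '__VIEWSTATE" value="abc123" ... __EVENTVALIDATION" value="xyz789"'
print(update_events({}, sample))
# {'__VIEWSTATE': 'abc123', '__EVENTVALIDATION': 'xyz789'}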