def makeParam(table):
    """Build the crawler parameter dictionary for the zjcredit.gov.cn site.

    The dictionary holds the station name, the entry/list/detail URLs (both
    host-name and raw-IP variants), the POST form fields for the list query,
    the database settings, and, under ``"conn"``, whatever
    ``jTool.initCursor`` returns for those database settings.

    ``table`` is accepted but not used by this function.
    """
    settings = {
        "station": "信用浙江",
        "begin_url": "http://www.zjcredit.gov.cn:8000/CreditQuery.aspx?sectionID=02",
        "query_url": "http://www.zjcredit.gov.cn:8000/ListQuery.aspx",
        "post_data_dic": {
            "isIntermediary": "False",
            "isOpen": "False",
            "pageLength": "20",
            "recordTotal": "1778190",
            "sectionID": "02",
            "sortDirection": "1",
            "sortField": "CreditID",
        },
        "preUrl": "http://www.zjcredit.gov.cn:8000/EnterpriseInfo.aspx?creditID=",
        "preUrlip": "http://218.108.28.28:8000/EnterpriseInfo.aspx?creditID=",
        "basePostUrl": "http://www.zjcredit.gov.cn:8000/GetInfoByDataSupplier.aspx",
        "basePostUrlip": "http://218.108.28.28:8000/GetInfoByDataSupplier.aspx",
        "dbHost": "localhost",
        "dbUser": "******",
        "dbPasswd": "root",
        "rdb": "rawData",
    }
    # The database handle travels with the rest of the parameters.
    settings["conn"] = jTool.initCursor(
        settings["dbHost"],
        settings["dbUser"],
        settings["dbPasswd"],
        settings["rdb"],
    )
    return settings
def getAllpageList(start, end):
    """Process every list page numbered ``start`` through ``end`` inclusive.

    Each page number is handed to ``getDetailListPages`` together with one
    shared database handle, and a progress line is printed after each page.
    """
    db = jTool.initCursor('localhost', 'root', 'root', 'rawData')
    page = start
    while page <= end:
        getDetailListPages(page, db)
        print('page ok:' + str(page))
        page += 1
def mainLoop(start, end):
    """Fetch detail data for pending ``base_page_list`` rows with ids start..end.

    For every id in the inclusive range, the row is looked up with
    ``status = 0``.  Ids with no such row are skipped.  For a pending row,
    ``getPageField`` is tried at most twice, each time through a randomly
    chosen proxy from ``proxy.txt``; a truthy result marks the row with
    ``status = '1'``.  All status updates are committed once, after the
    whole range has been walked.

    The cursors and the connection are closed even when a query or a fetch
    raises, so an aborted run does not leak database handles.
    """
    proxyList = jTool.getProxy('proxy.txt')
    pcount = len(proxyList) - 1
    # NOTE(review): the list carries two different 'Cookie:' entries; kept
    # exactly as found because the server-side effect cannot be seen here.
    head = [
        'Accept:text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset:GBK,utf-8;q=0.7,*;q=0.3',
        'Accept-Encoding:gzip,deflate,sdch',
        'Accept-Language:zh-CN,zh;q=0.8',
        'Cache-Control:max-age=0',
        'Connection:keep-alive',
        'Cookie:ASP.NET_SessionId=t3isah45gu5kb4454qyxkhzy; lzstat_uv=6061202253430616218|2529639; lzstat_ss=953382219_1_1373621147_2529639; _gscu_374314293=7359234405sddy11; _gscs_374314293=73592344d74zfy11|pv:3; _gscbrs_374314293=1; ECStaticSession=ECS81',
        'Host:www.zjcredit.gov.cn:8000',
        'Pragma:no-cache',
        'Cookie:_gscu_374314293=73631708ff8h1y17; lzstat_uv=106813037832225946|2529639; ECStaticSession=ECS80; ASP.NET_SessionId=5dhxxl45gr4d0aexnf1uiu55; _gscbrs_374314293=1; lzstat_ss=815622537_1_1374448570_2529639; _gscs_374314293=t74419759zee6a318|pv:2',
        'Origin:http://www.zjcredit.gov.cn:8000',
        'Referer:http://www.zjcredit.gov.cn:8000/ListPrompts.aspx?sectionID=01&tableID=CourtNotCarryOut&associateID=00000000000000000&hasPromptHistroy=False',
        'User-Agent:Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.71 Safari/537.36'
    ]
    conn = jTool.initCursor('localhost', 'root', 'root', 'rawData')
    cursor = None
    cursor2 = None
    try:
        cursor = conn.cursor()
        cursor2 = conn.cursor()
        while start <= end:
            # start is numeric (it is compared and incremented), so building
            # the statement from str(start) cannot inject arbitrary SQL.
            sql = 'select * from base_page_list where id = ' + str(start) + ' and status = 0 limit 1'
            cursor.execute(sql)
            record = cursor.fetchone()
            if not record:
                # No pending row for this id: nothing to fetch.
                start += 1
                continue
            # Column 1 is used as the company name, column 3 as the row id.
            corpName = record[1]
            rowID = record[3]
            print(corpName + ', ' + str(start))
            rt = None
            count = 1
            # Up to two attempts, each through a freshly picked proxy.
            while not rt and count <= 2:
                proxy = str(proxyList[random.randint(0, pcount)]).strip()
                rt = getPageField(conn, proxy, head, str(start), rowID, corpName)
                count += 1
            if rt:
                print('id' + str(start) + ' ok')
                jTool.updateData(cursor2, ' where id = ' + str(start) + ' ', 'base_page_list', {'status': '1'})
            # Advance unconditionally: re-querying the same id after a
            # success costs an extra SELECT and would spin forever if the
            # status update did not take effect.
            start += 1
        conn.commit()
    finally:
        # Release database resources on both the normal and the error path.
        for handle in (cursor, cursor2, conn):
            if handle is not None:
                handle.close()
def makeParam():
    """Build the crawler parameter dictionary for the zjcredit.gov.cn site.

    The dictionary holds the station name, the entry/list/detail URLs (both
    host-name and raw-IP variants), the POST form fields for the list query,
    the database settings, and, under ``'conn'``, whatever
    ``jTool.initCursor`` returns for those database settings.
    """
    settings = {
        'station': '信用浙江',
        'begin_url': 'http://www.zjcredit.gov.cn:8000/CreditQuery.aspx?sectionID=02',
        'query_url': 'http://www.zjcredit.gov.cn:8000/ListQuery.aspx',
        'post_data_dic': {
            'isIntermediary': 'False',
            'isOpen': 'False',
            'pageLength': '20',
            'recordTotal': '1778190',
            'sectionID': '02',
            'sortDirection': '1',
            'sortField': 'CreditID',
        },
        'preUrl': 'http://www.zjcredit.gov.cn:8000/EnterpriseInfo.aspx?creditID=',
        'preUrlip': 'http://218.108.28.28:8000/EnterpriseInfo.aspx?creditID=',
        'basePostUrl': 'http://www.zjcredit.gov.cn:8000/GetInfoByDataSupplier.aspx',
        'basePostUrlip': 'http://218.108.28.28:8000/GetInfoByDataSupplier.aspx',
        'dbHost': 'localhost',
        'dbUser': '******',
        'dbPasswd': 'root',
        'rdb': 'rawData',
    }
    # The database handle travels with the rest of the parameters.
    settings['conn'] = jTool.initCursor(
        settings['dbHost'],
        settings['dbUser'],
        settings['dbPasswd'],
        settings['rdb'],
    )
    return settings
def initMyCursor(db):
    """Return ``jTool.initCursor``'s result for database ``db`` on localhost.

    Host, user and password are fixed inside this function; only the
    database name is chosen by the caller.
    """
    return jTool.initCursor('localhost', '******', 'root', db)
#!/usr/bin/env python
#encoding=utf-8
import sys
import jTool
import extLib

reload(sys)
sys.setdefaultencoding("utf-8")


# Works on the relevant fields of enterprise_record_raw / enterprise_raw:
# extracts, cleans and converts the data.
# The data stays in the corresponding table.  Once this is more complete, a
# data-conversion feature can be added to replace the PHP script and handle
# field conversion and default-value filling.
def enterprise_record_raw_function(conn, start, end):
    """Placeholder handler; it only prints ``hi`` and ignores its arguments.

    The command-line entry point below dispatches to ``extLib``, not to
    this function.
    """
    print('hi')


if __name__ == '__main__':
    print('*' * 50)
    print('Run like this : python ext2.py enterprise_raw 1 10')
    print('now accept enterprise_raw,enterprise_record_raw')
    print('*' * 50)

    # Fail with a clear message instead of an IndexError traceback when
    # the table name or the id range is missing.
    if len(sys.argv) < 4:
        sys.exit('missing arguments: expected <table> <start> <end>')

    table = sys.argv[1]
    # start and end are handed on as the raw command-line strings.
    start = sys.argv[2]
    end = sys.argv[3]

    # Resolve the handler before connecting, so an unknown table name does
    # not open a database connection only to crash with AttributeError.
    output_function = getattr(extLib, table + '_function', None)
    if output_function is None:
        sys.exit('unsupported table: ' + str(table))

    conn = jTool.initCursor('localhost', 'root', 'root', 'rawData')
    print('begin to extract and clean data from table ' + str(table))
    output_function(conn, start, end)
def makeParam():
    """Load crawler parameters from ``config.ini`` and attach a DB handle.

    The server, database and URL keys listed below are read through
    ``jTool.getConfigParam``; the result of ``jTool.initCursor`` for the
    database keys is stored under ``'conn'`` in the same dictionary.
    """
    wanted_keys = [
        'hostSvr',
        'dbHost',
        'dbUser',
        'dbPasswd',
        'rdb',
        'basePostUrl',
        'preUrl',
        'basePostUrlip',
        'preUrlip',
    ]
    params = jTool.getConfigParam(wanted_keys, 'config.ini')
    params['conn'] = jTool.initCursor(
        params['dbHost'],
        params['dbUser'],
        params['dbPasswd'],
        params['rdb'],
    )
    return params
# -*-coding: utf-8 -*-
#encoding=utf-8
import sys
import jTool

reload(sys)
sys.setdefaultencoding("utf-8")

# Initializes the task table; running it once per task is enough.
# Connection settings and the table to initialize both come from server.ini.
paramDic = jTool.getConfigParam(
    ['host', 'user', 'passwd', 'db', 'initTable'],
    'server.ini',
)
conn = jTool.initCursor(
    paramDic['host'],
    paramDic['user'],
    paramDic['passwd'],
    paramDic['db'],
)
jTool.initTaskTable(conn, paramDic['initTable'])
def __init__(self):
    """Read the DB settings from ``server.ini`` and store the handle.

    ``self.conn`` is set to whatever ``jTool.initCursor`` returns for the
    configured host, user, password and database.
    """
    cfg = jTool.getConfigParam(['host', 'user', 'passwd', 'db'], 'server.ini')
    self.conn = jTool.initCursor(cfg['host'], cfg['user'], cfg['passwd'], cfg['db'])