Example #1
0
def doTask(eid, url, ename, param):
    '''
    Fill ``param`` in for one enterprise record and pass it to the crawler.

    ``param`` is modified in place: ``eid`` (as a string), ``ename`` and
    ``url`` are written into it.  The target URL is the IP-based prefix
    ``param['preUrlip']`` followed by ``url`` (the host-name based
    ``param['preUrl']`` is not used here).

    The whole proxy list read from ``proxy.txt`` is handed to the crawler.

    Returns whatever ``rp.crawlUrl`` returns.  NOTE(review): the original
    note on this function says the page is fetched with up to 6 attempts,
    stored in the remote database, and that the result is True on success
    and False otherwise -- that logic would live in ``rp.crawlUrl`` and is
    not visible here; confirm against that function.
    '''
    param.update(eid=str(eid), ename=ename)
    param['url'] = param['preUrlip'] + url
    proxies = jTool.getProxy('proxy.txt')
    return rp.crawlUrl(param['conn'], param, proxies)
Example #2
0
def _parseLogFields(line):
    '''
    Extract the ``key:value`` fields of one comma-separated log line.

    Each comma-separated token is split on its FIRST colon only, so values
    that themselves contain colons (e.g. ``http://host:8000/...``) are kept
    whole no matter how many colons they contain.  Tokens without a colon
    are skipped.  Keys are stripped of surrounding whitespace and a
    trailing newline is removed from the value.

    Returns a dict mapping field name to field value.
    '''
    fields = {}
    for token in line.split(','):
        key, sep, value = token.partition(':')
        if not sep:
            # No colon at all: not a key/value field.
            continue
        fields[key.strip()] = value.rstrip('\n')
    return fields


def operLog(logFileName):
    '''
    Re-run the crawler for every ``G`` entry found in a log file.

    For each line of ``logFileName`` that starts with ``G``, the line's
    ``key:value`` fields are merged into the parameter dictionary,
    ``param['id']`` is set from ``param['eid']`` and ``rp.crawlUrl`` is
    called.  All other lines (including those starting with ``P``) are
    ignored.

    The same parameter dictionary is reused for every line, so a field
    that is missing from a line keeps the value of the previous one.  A
    single proxy is picked at random from ``proxy.txt`` and used for the
    whole run.

    Raises KeyError if a ``G`` line is processed before any ``eid`` field
    has been seen.
    '''
    # Open the log first so a missing file fails before any database
    # connection is made; the with-block guarantees the handle is closed.
    with open(logFileName) as logFile:
        param = makeParam()
        param['taskTable'] = 'item_url_task'
        param['num'] = '999'
        proxyList = jTool.getProxy('proxy.txt')
        proxy = proxyList[random.randint(0, len(proxyList) - 1)]
        for line in logFile:
            if not line.startswith('G'):
                continue
            param.update(_parseLogFields(line))
            param['id'] = param['eid']
            rp.crawlUrl(param['conn'], param, proxy)
Example #3
0
def doTask(task, param, proxy):
    '''
    Crawl the page described by one task row.

    ``task`` is an indexable record: item 0 is stored (as a string) under
    ``param['id']``, item 1 is appended to ``param['preUrlip']`` to form
    ``param['url']`` and item 2 is stored under ``param['ename']``.
    ``param`` is modified in place.

    Returns whatever ``rp.crawlUrl`` returns for the given ``proxy``.
    '''
    param['id'] = str(task[0])
    urlPrefix = param['preUrlip']
    param['url'] = urlPrefix + task[1]
    param['ename'] = task[2]
    crawl = rp.crawlUrl
    return crawl(param['conn'], param, proxy)
Example #4
0
def makeParam():
    '''
    Build the parameter dictionary shared by the crawler functions.

    The dictionary holds the station name, the entry/query URLs of
    www.zjcredit.gov.cn (plus IP-based variants of the detail and post
    URLs), the default POST form fields, and the database settings.  The
    object returned by ``jTool.initCursor`` for those database settings is
    stored under ``'conn'``.

    Returns the populated dict.
    '''
    # NOTE(review): database credentials are hard-coded here; consider
    # moving them to configuration.
    dbHost = 'localhost'
    dbUser = '******'
    dbPasswd = 'root'
    rdb = 'rawData'

    settings = {
        'station': '信用浙江',
        'begin_url': 'http://www.zjcredit.gov.cn:8000/CreditQuery.aspx?sectionID=02',
        'query_url': 'http://www.zjcredit.gov.cn:8000/ListQuery.aspx',
        'post_data_dic': {
            'isIntermediary': 'False',
            'isOpen': 'False',
            'pageLength': '20',
            'recordTotal': '1778190',
            'sectionID': '02',
            'sortDirection': '1',
            'sortField': 'CreditID',
        },
        'preUrl': 'http://www.zjcredit.gov.cn:8000/EnterpriseInfo.aspx?creditID=',
        'preUrlip': 'http://218.108.28.28:8000/EnterpriseInfo.aspx?creditID=',
        'basePostUrl': 'http://www.zjcredit.gov.cn:8000/GetInfoByDataSupplier.aspx',
        'basePostUrlip': 'http://218.108.28.28:8000/GetInfoByDataSupplier.aspx',
        'dbHost': dbHost,
        'dbUser': dbUser,
        'dbPasswd': dbPasswd,
        'rdb': rdb,
    }
    settings['conn'] = jTool.initCursor(dbHost, dbUser, dbPasswd, rdb)
    return settings

# Ad-hoc run executed at import time: crawl one hard-coded enterprise page
# through a randomly chosen proxy.
param = makeParam()
param.update({'taskTable': 'item_url_task', 'num': '999'})

# Pick one proxy at random from the list read from proxy.txt.
proxyList = jTool.getProxy('proxy.txt')
pcount = len(proxyList) - 1
proxy = proxyList[random.randint(0, pcount)]

# Fixed sample record; note the URL uses the host name, not 'preUrlip'.
param.update({
    'url': 'http://www.zjcredit.gov.cn:8000/EnterpriseInfo.aspx?creditID=F651B7F17FEEDA7A',
    'eid': '69999',
    'id': '69999',
    'ename': '湖州市邮政局千金邮电所',
})
rp.crawlUrl(param['conn'], param, proxy)