コード例 #1
0
ファイル: extLib.py プロジェクト: jhfnetboy/proxyCrawler
def enterprise_record_raw_1_function(conn, start, end):
    '''
    规则:看字段转换表内内容
    把enterprise_record_raw记录转换转入自己表中的其他字段(轻度数据提取)
    '''
    tableName = 'enterprise_record_raw_1'
    cursor = conn.cursor()
    cursor2 = conn.cursor()
    qSql = "select id, eid, enterprise_name, content from " + tableName + ' where id >='+str(start)+' and id <='+str(end)
    cursor.execute(qSql)
    print  "start extract data in "+str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    count = 0
    while 1:
        record = cursor.fetchone()
        if not record or record[0]>end:
            continue      
        count += 1
        id = record[0]
        eid = record[1]
        content = record[3].strip()
#        print id
#        print eid
        clist = content.split('r_obj = t_obj.add_record(true);\n')
       
        print '企业名称:'+ record[2]+', eid:'+str(eid)+', id:'+str(id)
        tmp = clist[0].split(':')
        publisher = tmp[0].split(',')[-1].strip()
        print publisher
        try:
            category = tmp[1].split(',')[0].strip()
            print category
        except:
            continue
        if len(clist)<2:
            recs = clist[0]
        else:
            recs = ''.join(clist[1].split(', false);'))
        tmp = recs.split('\n')
        for i in range(len(tmp)):
            tt = tmp[i].split(',')
            del tt[0]
            tmp[i] = ''.join(tt)
        records = ';'.join(tmp)
        records = jTool.clearX(['(', ')', 'true'], records).strip()
        print records
        print '*'*30
        tmpDic = {}
        tmpDic['publisher'] = publisher
        tmpDic['category'] = category
        tmpDic['records'] = records
        where = ' where id = '+str(id)
        jTool.updateData(cursor2, where, tableName, tmpDic)
        tmpDic = {}
        conn.commit()
    print  "complete extract "+str(count)+" records in "+str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    cursor2.close()
    cursor.close()
    conn.close()
    return True
コード例 #2
0
ファイル: getMore.py プロジェクト: jhfnetboy/proxyCrawler
def mainLoop(start, end):
    proxyList = jTool.getProxy('proxy.txt')
    pcount = len(proxyList)-1
    head = ['Accept:text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                    'Accept-Charset:GBK,utf-8;q=0.7,*;q=0.3',
                    'Accept-Encoding:gzip,deflate,sdch',
                    'Accept-Language:zh-CN,zh;q=0.8',
                    'Cache-Control:max-age=0',
                    'Connection:keep-alive',
                    'Cookie:ASP.NET_SessionId=t3isah45gu5kb4454qyxkhzy; lzstat_uv=6061202253430616218|2529639; lzstat_ss=953382219_1_1373621147_2529639; _gscu_374314293=7359234405sddy11; _gscs_374314293=73592344d74zfy11|pv:3; _gscbrs_374314293=1; ECStaticSession=ECS81',
                    'Host:www.zjcredit.gov.cn:8000',
                    'Pragma:no-cache',
                    'Cookie:_gscu_374314293=73631708ff8h1y17; lzstat_uv=106813037832225946|2529639; ECStaticSession=ECS80; ASP.NET_SessionId=5dhxxl45gr4d0aexnf1uiu55; _gscbrs_374314293=1; lzstat_ss=815622537_1_1374448570_2529639; _gscs_374314293=t74419759zee6a318|pv:2',
                    'Origin:http://www.zjcredit.gov.cn:8000',
                    'Referer:http://www.zjcredit.gov.cn:8000/ListPrompts.aspx?sectionID=01&tableID=CourtNotCarryOut&associateID=00000000000000000&hasPromptHistroy=False',
                    'User-Agent:Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.71 Safari/537.36'
                    ]
    conn = jTool.initCursor('localhost', 'root', 'root', 'rawData')
    cursor = conn.cursor()
    cursor2 = conn.cursor()
    while start<=end:
        sql = 'select *  from base_page_list where id = '+str(start)+' and status = 0 limit 1'
        cursor.execute(sql)
        record = cursor.fetchone()  
        if not record and start<=end:
            start += 1
            continue
        
        corpName = record[1]
        rowID = record[3]
        print corpName+', '+str(start)
        rt = None
        count = 1
        while not rt and count<=2:
#            print count
            proxy = str(proxyList[random.randint(0, pcount)]).strip()
#            print proxy
            rt = getPageField(conn, proxy, head, str(start), rowID, corpName)
#            print rt
            count += 1
            if rt:
                print 'id'+str(start)+' ok'
                jTool.updateData(cursor2, ' where id = '+str(start)+' ', 'base_page_list', {'status': '1'})
                continue
        start += 1
        conn.commit()
    cursor.close()
    cursor2.close()
    conn.close()
コード例 #3
0
ファイル: rp.py プロジェクト: jhfnetboy/proxyCrawler
def crawlGetUrl(conn, param, proxy):
    '''
    以get方式获得企业基本信息并添加到enterprise_raw表
    '''
    cursor = conn.cursor()
    cursor2 = conn.cursor()
    cursor4 = conn.cursor()
    recordExists = jTool.notExsitsRecord(cursor4, 'enterprise_raw', 'eid', param['id'])
    cursor4.close()
    if recordExists:
#        print param['ename']+'基本信息已存在,eid'+str(param['eid'])
        jTool.updateData(cursor, ' where id = '+str(param['iid'])+' limit 1', 'item_url_task', {'status': '1'})
        cursor.close()
        conn.commit()
        return {'logic': True, 'rtData': recordExists}
    if not recordExists:
#        print 'get url:'+param['url']+', '+param['ename'].strip()+',id:'+param['id']+', proxy:'+proxy
        content = jTool.getContentByProxy(proxy, param['url'])
        if not content:
            print '获取基本信息页面内容为空或失败'
            return {'logic': False, 'rtData': {'type': 'getPageError', 'postPage': ' '}}
        recEntBaseDic = None
        if content:
            try:
                recEntBaseDic = getEntBase(content)
            except Exception, e:
                print '解析企业基本信息页面失败', __name__, e
                return {'logic': False, 'rtData': {'type': 'parsePageBaseError', 'postPage': ' '}}
        if not recEntBaseDic:
            return {'logic': False, 'rtData': {'type': 'parsePageBaseError', 'postPage': ' '}}
        recEntBaseDic['url'] = param['url']
        recEntBaseDic['eid'] = param['id']
        recEntBaseDic['postContent'] = ' '
        recEntBaseDic['enterprise_name'] = param['ename'].strip('')
        recEntBaseDic['ctime'] = str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
        insertId = None
        try:
            jTool.insertData(cursor2, 'enterprise_raw', recEntBaseDic)
            insertId = conn.insert_id()
            conn.commit()
            print str(param['id'])+'    Insert enterprise baseinfo successfully in ' + str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
            jTool.updateData(cursor, ' where id = '+str(param['iid'])+' limit 1', 'item_url_task', {'status': '1'})
            return {'logic': True, 'rtData': insertId}
        except Exception, e:
            print 'Fail to insert baseinfo record '+ ', id is '+str(param['id'])+', proxy:'+proxy, __name__, e
            print recEntBaseDic
            return {'logic': False, 'rtData': {'type': 'insertPageError', 'postPage': ' '}}
コード例 #4
0
ファイル: rp.py プロジェクト: jhfnetboy/proxyCrawler
#    postContent =cursor.fetchone()
#    cursor.close()
#    try:
#        if len(postContent[0])>10:
##            print '企业细节记录已存在,跳过'
#            return {'logic': True}
#    except:
#        pass
#    print 'post url:'+str(param['basePostUrlip'])+', '+param['ename'].strip()+',id:'+param['id']+', proxy:'+proxy
    try:
        contentRecord = jTool.getContentByProxy(proxy, str(param['basePostUrlip']), post_data_dic)
    except Exception, e:
        print '页面未包含详细记录或获取失败', __name__, e
        return {'logic': False, 'rtData': {'type': 'postContentError', 'postContent': ' '}}
    try:
        contentRecord = cutContent(contentRecord)
        print '    Get post content OK'
        if len(contentRecord)>10:
            jTool.updateData(cursor1, ' where id = '+str(param['iid']), 'enterprise_raw', {'status': '2','postContent': str(contentRecord).decode('utf-8', 'ignore')})
            conn.commit()
            cursor1.close()
            print '    Save postContent OK'
            return {'logic': True}
        else:
            print 'POST得到页面非企业详细记录页面'
            return {'logic': False, 'rtData': {'type': 'parsepostContentError', 'postContent': contentRecord}}
    except Exception, e:
        print '获取页面详细记录失败', __name__, e
        return {'logic': False, 'rtData': {'type': 'parsepostContentError', 'postContent': contentRecord}}
    
コード例 #5
0
ファイル: rp.py プロジェクト: jhfnetboy/proxyCrawler
def crawlUrl(conn, param, proxy):
    '''
    访问url并获得企业基本信息和信用记录并入库
    '''
    cursor2 = conn.cursor()
    cursor4 = conn.cursor()
    cursor5 = conn.cursor()
    cursor6 = conn.cursor()
    recordExists = jTool.notExsitsRecord(cursor4, 'enterprise_raw', 'eid', param['id'])
    cursor4.close()
    if recordExists:
        print '企业'+param['ename']+'基本信息已存在,直接尝试获取记录信息 \npage url:'+param['url']
    if not recordExists:
        print 'get url:'+param['url']+', '+param['ename'].strip()+',id:'+param['id']
        content = jTool.getContentByProxy(proxy, param['url'])
#        jTool.logit(str(content), 'errorPage.txt')
        if not content:
            print '获取基本信息页面内容为空或失败'
            jTool.logError('\nGet page content fail, method:get, ename:'+param['ename']+',id:'+param['id']+',url:'+ param['url'])
            return False

        recEntBaseDic = None
        if content:
            try:
                recEntBaseDic = getEntBase(content)
            except:
                print '解析企业基本信息页面失败'
                jTool.logError('\nParse page content fail, method:get, ename:'+param['ename']+',id:'+param['id']+',url:'+ param['url'])
                return False
        if not recEntBaseDic:
            return False
        recEntBaseDic['url'] = param['url']
        recEntBaseDic['eid'] = param['id']
        recEntBaseDic['postContent'] = ' '
        recEntBaseDic['enterprise_name'] = param['ename'].strip('')
        recEntBaseDic['ctime'] = str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))

        try:
            jTool.insertData(cursor2, 'enterprise_raw', recEntBaseDic)
            conn.commit()
            print '    Insert enterprise baseinfo successfully in ' + str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
        except:
            print 'Fail to insert record '+ ', id is '+str(param['id'])
            jTool.logError('Fail to insert record on '+ 'url is '+param['url']+', id is '+str(id))
            return False

    dataSupplier = "法院记录/工商记录/国税记录/质监记录/经信记录/安监记录/统计记录/环保记录/民政记录/司法记录/劳动记录/建设记录/国土记录/交通记录/发改记录/信息产业/科技记录/农业记录/林业记录/海洋渔业/物价记录/食品药品/文化记录/出版记录/广电记录/公安记录/外贸记录/外汇记录/海关记录/检验检疫/人防记录/证监记录/银监记录/保监记录/金融记录/其他记录/行业协会/机构评级/社会中介/阿里巴巴/企业自报/投诉记录/异议记录"
    post_data_dic = {'corpName': param['ename'], 'creditID': param['id'], 'dataSupplier': dataSupplier, 'isAllInfo': 'False', 'organizeCode': '', 'returnFunction': 'parent.putDatasAndLoad'}
    recEntDetailDic = {}
    recEntDetailDic['url'] = param['url']
    recEntDetailDic['eid'] = param['id']
    recEntDetailDic['enterprise_name'] = param['ename'].strip()
    recEntDetailDic['records'] = ' '
    recEntDetailDic['ctime'] = str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    contentRecord = None
    jTool.getField(cursor6, 'enterprise_raw', 'postContent', ' where eid = '+str(param['id']))
    postContent =cursor6.fetchone()
    cursor6.close()
    if len(postContent[0])>10:
        print '企业细节记录已存在,跳过'
        return True
    print 'post url:'+str(param['basePostUrlip'])+', '+param['ename'].strip()+',id:'+param['id']
    try:
        contentRecord = jTool.getContentByProxy(proxy, str(param['basePostUrlip']), post_data_dic)
    except:
        print '页面未包含详细记录或获取失败'
        jTool.logError('\nPost get None, ename:'+param['ename'].strip()+',id:'+param['id']+',url:'+ param['url'])
        return True
    try:
        contentRecord = cutContent(contentRecord)
        print 'Get post content OK'
        jTool.logit(str(contentRecord), 'errorContent.log')
        if contentRecord:
            jTool.updateData(cursor5, ' where eid = '+param['id'], 'enterprise_raw', {'postContent': str(contentRecord).decode('utf-8', 'ignore')})
            cursor5.close()
        else:
            print 'POST得到页面非企业详细记录页面'
            jTool.logError('\nPage content error, method:post, ename:'+param['ename']+', id:'+param['id']+',url:'+ param['url']+', proxy:'+proxy)
            return True
    except:
        print '解析页面详细记录失败'
        jTool.logError('\nParse page content fail, method:post, ename:'+param['ename']+', id:'+param['id']+',url:'+ param['url']+', proxy:'+proxy)
        return False
    print 'Fetch and insert successfully in ' + str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    return True