Example #1
import urllib2
from time import sleep, ctime

def curl_google(flog,url,cnt=3,timeout=20):
    '''Fetch the Google reviews page for the given product URL. TODO: log failures.'''
    headers={'User-Agent':'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.142 Safari/535.19',
             'Cookie':'GDSESS=ID=cf8ac8be49f8acdb:TM=1334745971:C=c:IP=59.64.138.143-:S=ADSvE-eqbAFWgJvEN9jwBqGchL-v7B-QPQ; PREF=ID=a4da58a0b2f23f10:NW=1:TM=1334745976:LM=1334745977:S=IxLhWtuXtWNUg034'
             }
    req = urllib2.Request(url,headers=headers)
    try:
        html = urllib2.urlopen(req,timeout=timeout).read()    # raises a timeout error if the request times out
    except Exception,data:
        print "!!!! ERROR::curl_google( %s ) sleep(5) \t"%url,data        
        if cnt!=0:
            sleep(5)
            return curl_google(flog,url,cnt-1,timeout)  # recursive retry; at most cnt attempts in total
        else:
            log_info(flog,'curl_google',data,{'url':url},ctime())
            return ""
Example #2
import shelve
from time import sleep, ctime

def crawler_one(path_log,psave,pre_url):
    '''Crawl all reviews for one product, 10 per request; one review per line, columns separated by @@@@.'''
    f = file(path_log,'ab')
    url = ''.join([pre_url,'0'])     # the first page defaults to offset 0
    html = curl_google(f,url)
    codes = get_code(html)
    sz = get_size(html,codes)        # total number of reviews for this product
    res = get_comm(html,codes)
    emptyF = 0                       # count of consecutive empty pages; stop crawling once the threshold is hit
    # Cache file: same prefix as the results file, with a .bak suffix
    sh = shelve.open(''.join([psave,'.bak']),writeback=True)
    # Use the query parameter (q=...) of the URL as the key in the shelve cache
    tbpos = url.find('q=') ; tepos = url[tbpos:].find('&')+tbpos
    keys=url[tbpos:tepos]
    sh[keys]=[res]
    sh.sync()
    sleep(1)
    sf = file(psave,'ab')            # results file
    sf.write( '\n'.join(res)+'\n' )  # one review per line
    sf.flush() 
    for i in range(0,sz,10):        # Google returns 10 reviews per page
        url = ''.join([pre_url,str(i)])
        html = curl_google(f,url)
#        codes = get_code(html)
        res = get_comm(html,codes)   # reuse the encoding detected on the first page
        print "!!!! INFO::crawler_one(%s) html-sz:%d, comment-list-sz:%d, total=%d !!!!"%(url,len(html),len(res),sz)
        if res==[]:
            emptyF += 1
            if emptyF <5: continue
            else:
                log_info(f,'crawler_one()','got consecutive empty HTML pages',{'emptyF':emptyF},ctime())
                break               # stop crawling this product
        emptyF = 0                  # reset the consecutive-empty counter on a non-empty page
        sf.write( '\n'.join(res)+'\n' )
        sf.flush()
        sh[keys].append(res)
        sh.sync() 
        sleep(3)                    # wait for flush()
    sf.close()
    sh.close()
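A hypothetical driver for crawler_one might look like the sketch below. The file paths and the review URL are placeholders, not taken from the original code; pre_url is assumed to end immediately before the numeric start offset so that crawler_one can append 0, 10, 20, and so on.

if __name__ == '__main__':
    # Placeholder paths and URL for illustration only.
    path_log = 'crawl.log'
    psave    = 'comments.txt'
    pre_url  = 'https://www.google.com/products/reviews?q=example&num=10&start='
    crawler_one(path_log, psave, pre_url)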