import shelve
import urllib2
from time import sleep, ctime


def curl_google(flog, url, cnt=3, timeout=20):
    '''Fetch one page of Google reviews for the given product. TODO: record failures.'''
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.142 Safari/535.19',
        'Cookie': 'GDSESS=ID=cf8ac8be49f8acdb:TM=1334745971:C=c:IP=59.64.138.143-:S=ADSvE-eqbAFWgJvEN9jwBqGchL-v7B-QPQ; PREF=ID=a4da58a0b2f23f10:NW=1:TM=1334745976:LM=1334745977:S=IxLhWtuXtWNUg034',
    }
    req = urllib2.Request(url, headers=headers)
    try:
        html = urllib2.urlopen(req, timeout=timeout).read()  # raises a timeout error on timeout
    except Exception as data:
        print "!!!! ERROR::curl_google( %s ) sleep(5) \t" % url, data
        if cnt != 0:
            sleep(5)
            # recursive retry, at most cnt more attempts; keep the caller's timeout
            return curl_google(flog, url, cnt - 1, timeout)
        else:
            log_info(flog, 'curl_google', data, {'url': url}, ctime())
            return ""
    return html
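
# curl_google() and crawler_one() call a log_info() helper that is not defined
# in this file. The sketch below is a hypothetical stand-in, assuming only the
# (flog, caller, error, context, timestamp) argument order visible at the call
# sites; the project's real helper may format its records differently.
def log_info(flog, caller, err, ctx, ts):
    '''Hypothetical failure logger: append one tab-separated record per error.'''
    flog.write('%s\t%s\t%s\t%s\n' % (ts, caller, err, ctx))
    flog.flush()
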
def crawler_one(path_log, psave, pre_url):
    '''Crawl all reviews for one product, 10 per page; each review is stored on
    one line, with @@@@ as the column separator.'''
    f = open(path_log, 'ab')
    url = ''.join([pre_url, '0'])  # start from the first page (offset 0) by default
    html = curl_google(f, url)
    codes = get_code(html)
    sz = get_size(html, codes)  # ; print codes, sz; return html
    res = get_comm(html, codes)
    emptyF = 0  # count consecutive empty pages; stop crawling once the threshold is hit
    # create the cache file: same prefix as the result file, with .bak appended
    sh = shelve.open(''.join([psave, '.bak']), writeback=True)
    # use the q=... query term of the URL as the shelve key
    tbpos = url.find('q=')
    tepos = url[tbpos:].find('&') + tbpos
    keys = url[tbpos:tepos]
    sh[keys] = [res]
    sh.sync()
    sleep(1)
    sf = open(psave, 'ab')  # result file
    sf.write('\n'.join(res) + '\n')  # one review per line
    sf.flush()
    for i in range(10, sz, 10):  # Google returns 10 reviews per page; page 0 was fetched above
        url = ''.join([pre_url, str(i)])
        html = curl_google(f, url)  # ; return html
        # codes = get_code(html)
        res = get_comm(html, codes)  # assume every page uses the same encoding
        print "!!!! INFO::crawler_one(%s) html-sz:%d, comment-list-sz:%d, total=%d !!!!" % (url, len(html), len(res), sz)
        if res == []:
            emptyF += 1
            if emptyF < 5:
                continue
            else:
                log_info(f, 'crawler_one()', 'got consecutive empty HTML pages', {'emptyF': emptyF}, ctime())
                break  # abort the crawl
        emptyF = 0  # reset the counter after a non-empty page
        sf.write('\n'.join(res) + '\n')
        sf.flush()
        sh[keys].append(res)
        sh.sync()
        sleep(3)  # wait for flush()
    sf.close()
    sh.close()
    f.close()
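
# A minimal usage sketch. It assumes pre_url is a Google reviews URL whose
# trailing query parameter is the page offset that crawler_one() appends, and
# that the parser helpers get_code/get_size/get_comm are defined elsewhere in
# the project. All file names and the URL below are hypothetical examples,
# not values taken from this file.
if __name__ == '__main__':
    pre_url = 'http://www.google.com/products/reviews?q=example&start='  # hypothetical
    crawler_one('crawl.log', 'example.comments', pre_url)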