def parse_body(self, response):
    print 'done'
    print 'parse_body: create dir-file, write in...'
    print 'parse_body: response: ', response
    item = response.meta['item']

    ## Detect the page encoding from the <meta> tags in <head>.
    encode_0 = response.xpath('/html/head')
    encode_1 = encode_0.xpath('.//meta')
    encode_3 = ''
    for en in encode_1:
        encode_2 = en.xpath('@content').extract()
        if len(encode_2) != 0:
            encode_2 = encode_2[0]
            if encode_2.find('charset') != -1:
                ## NOTE: the original used a chain of str.strip() calls
                ## ('text', '/', 'html', ...), but strip() removes characters,
                ## not substrings; splitting on 'charset=' is what was intended.
                encode_3 = encode_2.encode('utf-8').split('charset=')[-1].strip()
    print 'parse_body: head : encode_3 :', encode_3
    item['encode'] = encode_3

    # Build the per-day directory path.
    old_path = '/data/news_data/baidu_news/'
    t = time.localtime()
    t = time.strftime('%Y_%m_%d', t)
    new_path = os.path.join(old_path, t)
    item['path'] = new_path
    if not os.path.isdir(new_path):
        os.mkdir(new_path)

    # Build the file name.
    file = ''
    hash = 0
    bodys = response.xpath('//body')
    title = bodys.xpath('.//h1/text() | ../*[contains(@*,\'titl\')]').extract()
    time_release_t = bodys.xpath(
        ' .//*[ contains(@*,\'time\') ]/text()'
        ' | //*[@class=\'conText\']/div[@class="summaryNew"]/text()'
        ' | //div[@class="left-time"]/div[@class="left-t"]/text()'
        ' | //*[@id=\'k_left\']/div[2]/div/p/span[2]/text()'
        ' | //*[@id=\'C-Main-Article-QQ\']/div[1]/div[1]/div[1]/span[5]'
        ' | /html/body/div[8]/div[1]/div[4]/text()'
        ' | //*[@id=\'pubtime_baidu\']/text() ').extract()
    print 'parse_body: title: ', ''.join(title).encode('utf-8')
    if len(title) != 0 and len(''.join(title)) > 3:
        title_a = ''.join(title[0]).encode('utf-8')
        sha1obj = hashlib.sha1()
        sha1obj.update(title_a)
        hash = sha1obj.hexdigest()
        print 'hash: ', hash
        file = new_path + '/' + hash  ## the SHA-1 of the title is the file name
        item['hash'] = [hash]

        ## Check whether a file for this hash already exists anywhere
        ## under the known data path.
        path = '/data/news_data'
        pl = crawl_body.file_check(path, hash)
        if pl == 0:
            ## File writing itself now happens in the pipeline; here we only
            ## collect the fields onto the item.
            print 'file write begins. hash: ', hash

            ## Get the publication time; it may be absent, so fall back to ''.
            item['time_release'] = [''.join(time_release_t).encode('utf-8')]
            if len(time_release_t) == 0:
                item['time_release'] = ['']
            time_release = ''.join(time_release_t).encode('utf-8')
            print 'parse_body: time_release: ', time_release

            ## Extract the main body: every <p> under <body>, keeping only
            ## paragraphs longer than 30 characters.
            bodys_b = bodys.xpath('.//p')
            main_bodys = []
            print 'main_body: '
            for bod in bodys_b:
                print 'bod: ', bod
                main_body = bod.xpath('text()').extract()
                if len(main_body) != 0:
                    main_body = main_body[0]
                    if len(main_body) != 0 and len(''.join(main_body)) > 30:
                        print ''.join(main_body).encode('utf-8')
                        main_bodys.append(main_body)
            item['mainbody'] = main_bodys
            print 'finish.'
            return item
        else:
            print 'pl: ', pl
            print 'file already exists; nothing to do.'
    else:
        item['time_release'] = ['']
        item['hash'] = ['']
        print 'title is empty; nothing to do.'
    print '\n\n'
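## --- Hedged sketch: crawl_body.file_check() is defined elsewhere in this
## project. From its call sites and the 0/1/2 comments in the pipelines
## below, it evidently looks for a file named after the hash under `path`
## and reports its state. A minimal reimplementation under those
## assumptions (the name and the 200-byte threshold are illustrative, taken
## from the inline listdir check in the qq parse_body below):
import os

def file_check_sketch(path, hash):
    """Return 0 if no file named `hash` exists under `path`,
    1 if it exists and looks complete, 2 if it exists but is too small."""
    for root, dirs, files in os.walk(path):
        if hash in files:
            if os.path.getsize(os.path.join(root, hash)) < 200:
                return 2  # present but suspiciously small: caller rewrites it
            return 1      # present and large enough: caller skips the page
    return 0              # absent: caller creates and writes the file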
def _conditional_insert(self, tx, item):
    re = 0
    if len(item['title']) != 0:
        ## Part 1: database handling. A hash already present in any of the
        ## four news tables means the story was stored before (possibly by
        ## another spider).
        re = tx.execute("select * from baidunews where hash = %s ",
                        (item['hash'][0], ))
        re += tx.execute("select * from qqnews where hash = %s ",
                         (item['hash'][0], ))
        re += tx.execute("select * from sinanews where hash = %s ",
                         (item['hash'][0], ))
        re += tx.execute("select * from chinanews where hash = %s ",
                         (item['hash'][0], ))
        if re:
            log.msg("Item already stored in db: %s" % item, level=log.DEBUG)
            fp_s.write('hash_present: ')
            fp_s.write(item['hash'][0])
            fp_s.write('\n')
        else:
            ## Conditional insert into the database.
            tx.execute(
                "insert into baidunews (title,link,time_release,time_add,"
                " hash,manufacturer,path,encode)"
                " values (%s,%s,%s,%s, %s,%s ,%s, %s)",
                (item['title'], item['link'], item['time_release'],
                 datetime.datetime.now(), item['hash'],
                 item['manufacturer'], item['path'], item['encode']))
            fp_d.write('hash_present: ')
            fp_d.write(item['hash'][0])
            fp_d.write('\n')
            log.msg("Item stored in db: %s" % item, level=log.DEBUG)

        ## Part 2: save the data file, keyed on the title hash.
        path = '/data/news_data'
        hash = item['hash'][0]
        pl = crawl_body.file_check(path, hash)
        ## Conditional save: write (or rewrite) only when the data file is
        ## missing or abnormal.
        ## pl: 0 = file does not exist
        ##     1 = normal
        ##     2 = exists, but too small
        if pl != 1:
            # Build the per-day directory path.
            old_path = '/data/news_data/baidu_news/'
            t = time.localtime()
            t = time.strftime('%Y_%m_%d', t)
            new_path = os.path.join(old_path, t)
            if not os.path.isdir(new_path):
                os.mkdir(new_path)
            # Build the file name and write the record.
            file = new_path + '/' + ''.join(item['hash'])
            fileoper = open(file, 'w')
            fileoper.write('title:\n')
            fileoper.write(item['title'][0])
            fileoper.write('\ntime_release:\n')
            fileoper.write(''.join(item['time_release']))
            fileoper.write('\nlink:\n')
            fileoper.write(''.join(item['link']))
            fileoper.write('\nmainbody:\n')
            if len(item['mainbody']) >= 1:
                for bod in item['mainbody']:
                    ## u() is an external coercion helper (assumed): it
                    ## returns `bod` as a unicode object, which is then
                    ## encoded to UTF-8 for the file.
                    bod = u(bod, unicode)
                    bod = bod.encode('utf8')
                    fileoper.write(bod)
            fileoper.close()
    print '\n\n\n\n'
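## --- Hedged sketch: the four SELECT-by-hash statements above recur in
## every pipeline in this file. A table-driven helper expresses the same
## check more compactly; the name `hash_seen` is illustrative, not part of
## the project.
def hash_seen(tx, hash):
    """Return True if `hash` already exists in any of the news tables."""
    hits = 0
    for table in ('baidunews', 'qqnews', 'sinanews', 'chinanews'):
        # Table names cannot be bound as SQL parameters, so they are
        # interpolated from this fixed whitelist.
        hits += tx.execute("select * from " + table + " where hash = %s",
                           (hash, ))
    return hits != 0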
def parse_body(self, response):
    print 'done'
    print 'parse_body: create dir-file, write in...'
    print 'parse_body: response: ', response
    item = response.meta['item']

    # Build the per-day directory path.
    old_path = '/data/news_data/qq_news/'
    t = time.localtime()
    t = time.strftime('%Y_%m_%d', t)
    new_path = os.path.join(old_path, t)
    if not os.path.isdir(new_path):
        os.mkdir(new_path)

    # Build the file name.
    file = ''
    hash = 0
    title = response.xpath(
        '//*[@id=\'C-Main-Article-QQ\']/div[1]/h1/text()').extract()
    print 'parse_body: title: ', ''.join(title).encode('utf-8')
    if len(title) != 0:
        title_a = ''.join(title[0]).encode('utf-8')
        sha1obj = hashlib.sha1()
        sha1obj.update(title_a)
        hash = sha1obj.hexdigest()
        print 'hash: ', hash
        file = new_path + '/' + hash  ## the SHA-1 of the title is the file name
        item['hash'] = [hash]

        ## List the files under today's directory and check for duplicates.
        ## NOTE: the `pl` computed by this loop is immediately overwritten by
        ## file_check() below; it is kept only for its diagnostic prints.
        res = os.listdir(new_path)
        print 'parse_body: type(res): ', type(res)
        print 'parse_body: res: ', res
        pl = 0
        for sh in res:
            if sh == hash:
                pl = 1
                print 'file already exists: hash:', hash
                if os.path.getsize(new_path + '/' + ''.join(sh)) < 200:
                    pl = 0
                    print 'file exists but is too small; rewrite it. hash: ', hash
        print 'pl:', pl

        ## Start writing a text file.
        path = '/data/news_data/'
        pl = crawl_body.file_check(path, hash)
        if pl == 0:
            # Open the file.
            print 'file write begins. hash: ', hash
            fp = open(file, 'w')
            # Extract and write the title.
            title = response.xpath(
                '//*[@id=\'C-Main-Article-QQ\']/div[1]/h1/text()').extract()
            print 'title: ', ''.join(title[0]).encode('utf-8')
            fp.write('title:\n')
            fp.write(''.join(title[0]).encode('utf-8'))
            fp.write('\n')
            # Get and write the publication time (may be absent).
            time_release_t = response.xpath(
                '//*[@id=\'C-Main-Article-QQ\']/div[1]/div[1]/div[1]/span[@class="article-time"]/text()'
                ' | //*[@id="time_source"]/span/text()').extract()
            item['time_release'] = [''.join(time_release_t).encode('utf-8')]
            if len(time_release_t) == 0:
                item['time_release'] = ['']
            time_release = ''.join(time_release_t).encode('utf-8')
            print 'parse_body: time_release: ', time_release
            fp.write('time_release:\n')
            fp.write(time_release)
            fp.write('\n')
            # Get and write the abstract; it may be empty, so never index into it.
            abstract = response.xpath(
                '//*[@id=\'Cnt-Main-Article-QQ\']/p[1]/text()').extract()
            print 'parse_body: abstract: ', ''.join(abstract).encode('utf-8')
            item['abstract'] = abstract
            fp.write('abstract:\n')
            fp.write(''.join(abstract).encode('utf-8'))
            fp.write('\n')
            # Extract the main body: every <p> inside the article div.
            bodys_a = response.xpath('//div[@id=\'Cnt-Main-Article-QQ\']')
            bodys_b = bodys_a.xpath('.//p')
            fp.write('main_body: \n')
            print 'main_body: '
            fp.write('\n')
            for bod in bodys_b:
                main_body = bod.xpath('text()').extract()
                if len(main_body) != 0:
                    print ''.join(main_body[0]).encode('utf-8')
                    # Write each paragraph.
                    fp.write(''.join(main_body[0]).encode('utf-8'))
                    fp.write('\n')
            # Close the file.
            fp.close()
            print 'finish.'
            return item
        else:
            print 'pl: ', pl
            print 'file already exists; nothing to do.'
    else:
        item['time_release'] = ['']
        item['hash'] = ['']
        print 'title is empty; nothing to do.'
    print '\n\n'
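## --- Hedged sketch: every parse_body in this file repeats the same
## per-day directory dance (strftime + os.path.join + mkdir). A small
## helper would factor it out; the name `ensure_day_dir` is illustrative.
import os
import time

def ensure_day_dir(old_path):
    """Return old_path/YYYY_MM_DD, creating the directory if missing."""
    day = time.strftime('%Y_%m_%d', time.localtime())
    new_path = os.path.join(old_path, day)
    if not os.path.isdir(new_path):
        os.mkdir(new_path)  # assumes old_path itself already exists
    return new_path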
def _conditional_insert(self, tx, item):
    if len(item['title']) == 0 or len(item['mainbody']) == 0:
        return
    if len(item['title']) != 0:
        ## Part 1: database handling.
        re = tx.execute("select * from baidunews where hash = %s ",
                        (item['hash'][0], ))
        re += tx.execute("select * from qqnews where hash = %s ",
                         (item['hash'][0], ))
        re += tx.execute("select * from sinanews where hash = %s ",
                         (item['hash'][0], ))
        re += tx.execute("select * from chinanews where hash = %s ",
                         (item['hash'][0], ))
        tt = datetime.datetime.now()
        print 'type(tt): ', type(tt)
        if re:
            log.msg("Item has already been stored in db: %s" % item,
                    level=log.DEBUG)
            fp_s.write('hash_present: ')
            fp_s.write(item['hash'][0])
            fp_s.write('\n')
        else:
            if len(item['hash']) == 0:
                item['hash'] = ['']
            print 'pipeline: time_release: ', item['time_release']
            if len(item['time_release']) == 0:
                item['time_release'] = ['']
            tx.execute(
                "insert into sinanews(title, link, response_news,"
                " time_release, time_add, hash, manufacturer)"
                " values(%s,%s,%s,%s,%s,%s,%s)",
                (item['title'][0], item['link'], item['response_news'],
                 item['time_release'], tt, item['hash'],
                 item['manufacturer']))
            print 'db_store: title: ', ''.join(item['title'][0]).encode('utf-8')
            fp_d.write('hash_present: ')
            fp_d.write(item['hash'][0])
            fp_d.write('\n')
            log.msg("Item stored in db: %s" % item, level=log.DEBUG)

        ## Part 2: data-file handling.
        # Build the per-day directory path.
        old_path = '/data/news_data/sina_news/'
        hash = item['hash'][0]
        check_path = '/data/news_data'
        pl = crawl_body.file_check(check_path, hash)
        if pl == 1:
            return  # file already present and healthy
        t = time.localtime()
        t = time.strftime('%Y_%m_%d', t)
        new_path = os.path.join(old_path, t)
        if not os.path.isdir(new_path):
            os.mkdir(new_path)
        # Build the file name and write the record.
        file = new_path + '/' + ''.join(item['hash'][0])
        fileoper = open(file, 'w')
        fileoper.write('title:\n')
        fileoper.write(item['title'][0])
        ## NOTE: the original wrote the link twice (once right after the
        ## title, without a separating newline); one write in the shared
        ## field order (title, time_release, link, mainbody) is kept.
        fileoper.write('\ntime_release:\n')
        fileoper.write(''.join(item['time_release']))
        fileoper.write('\nlink:\n')
        fileoper.write(''.join(item['link'][0]))
        fileoper.write('\nmainbody:\n')
        print 'pipeline: item[\'mainbody\'] :', item['mainbody']
        bodys = item['mainbody']
        for bd in bodys:
            ## NOTE: the original discarded u()'s return value, a no-op on
            ## an immutable string; assign it, assuming u() returns the
            ## coerced string as in the baidu pipeline.
            bd = u(bd, 'utf8')
            fileoper.write(bd)
            fileoper.write('\n')
        fileoper.close()
def parse_body(self, response):
    item = response.meta['item']

    # Build the per-day directory path.
    old_path = '/data/news_data/qq_news/'
    t = time.localtime()
    t = time.strftime('%Y_%m_%d', t)
    new_path = os.path.join(old_path, t)
    if not os.path.isdir(new_path):
        os.mkdir(new_path)

    # Build the file name. Earlier revisions used the crawl time, then the
    # first 30 characters of the title; the SHA-1 of the title is used now.
    title = response.xpath(
        '//*[@id=\'C-Main-Article-QQ\']/div[1]/h1/text()').extract()
    file = ''
    if len(title) != 0:
        title_a = ''.join(title[0]).encode('utf-8')
        sha1obj = hashlib.sha1()
        sha1obj.update(title_a)
        hash = sha1obj.hexdigest()
        file = new_path + '/' + hash  ## the SHA-1 of the title is the file name
        item['hash'] = [hash]

        ## file_check() replaces the older inline os.listdir() duplicate scan.
        path = '/data/news_data/'
        pl = crawl_body.file_check(path, hash)
        if pl == 0:  ## the file does not exist yet; create and write it
            fp = open(file, 'w')
            # Write the title.
            fp.write('title:\n')
            fp.write(str(title[0].encode('utf-8')))
            fp.write('\n')
            # Get and write the publication time.
            time_release = response.xpath(
                '//*[@id=\'C-Main-Article-QQ\']/div[1]/div[1]/div[1]/span[@class="article-time"]/text()'
            ).extract()
            time_release = ''.join(time_release).encode('utf-8')
            item['time_release'] = time_release
            fp.write('time_release:\n')
            fp.write(time_release)
            fp.write('\n')
            # Get and write the abstract; it may be empty, so never index into it.
            abstract = response.xpath(
                '//*[@id=\'Cnt-Main-Article-QQ\']/p[1]/text()').extract()
            item['abstract'] = abstract
            fp.write('abstract:\n')
            fp.write(''.join(abstract).encode('utf-8'))
            fp.write('\n')
            # Extract and write the main body, paragraph by paragraph.
            bodys_a = response.xpath('//div[@id=\'Cnt-Main-Article-QQ\']')
            bodys_b = bodys_a.xpath('.//p')
            fp.write('main_body: \n')
            fp.write('\n')
            for bod in bodys_b:
                main_body = bod.xpath('text()').extract()
                if len(main_body) != 0:
                    fp.write(str(main_body[0].encode('utf-8')))
                    fp.write('\n')
            # Close the file.
            fp.close()
            return item
        else:
            pass  ## the file already exists and does not need rewriting
    else:
        pass  ## title is empty; nothing to do
def _conditional_insert(self, tx, item):
    re = 0
    now = datetime.datetime.now()
    if len(item['title']) != 0:
        ## Part 1: database handling.
        re = tx.execute("select * from baidunews where hash = %s ",
                        (item['hash'][0], ))
        re += tx.execute("select * from qqnews where hash = %s ",
                         (item['hash'][0], ))
        re += tx.execute("select * from sinanews where hash = %s ",
                         (item['hash'][0], ))
        re += tx.execute("select * from chinanews where hash = %s ",
                         (item['hash'][0], ))
        if re != 0:
            ## The title hash is already in the database; do not insert again.
            log.msg("Item already stored in db: %s" % item, level=log.DEBUG)
            fp_s.write('hash_present: ')
            fp_s.write(item['hash'][0])
            fp_s.write('\n')
        else:
            fp_d.write('hash_present: ')
            fp_d.write(item['hash'][0])
            fp_d.write('\n')
            t = datetime.datetime.now()
            tx.execute(
                "insert into qqnews(title,link,response_news,time_release,"
                "time_add,hash,manufacturer,path,encode)"
                " values(%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                (item['title'][0], item['link'], item['response_news'],
                 item['time_release'], t, item['hash'],
                 item['manufacturer'], item['path'], item['encode']))
            print 'db_store: title: ', ''.join(item['title'][0]).encode('utf-8')
            log.msg("Item stored in db: %s" % item, level=log.DEBUG)

        ## Part 2: data-file handling.
        path = '/data/news_data'
        ## NOTE: the original passed the whole list item['hash'] here; the
        ## other pipelines pass item['hash'][0], which is what file_check
        ## expects.
        hash = item['hash'][0]
        pl = crawl_body.file_check(path, hash)
        if pl != 1:
            # Build the per-day directory path.
            old_path = '/data/news_data/qq_news/'
            t = time.localtime()
            t = time.strftime('%Y_%m_%d', t)
            new_path = os.path.join(old_path, t)
            if not os.path.isdir(new_path):
                os.mkdir(new_path)
            # Build the file name and write the record.
            file = new_path + '/' + ''.join(item['hash'])
            fileoper = open(file, 'w')
            fileoper.write('title:\n')
            ## NOTE: the original called u() and utf_p() on the title and
            ## discarded the return values, a no-op on an immutable string;
            ## the title is written as-is.
            fileoper.write(item['title'][0])
            fileoper.write('\ntime_release:\n')
            fileoper.write(''.join(item['time_release']))
            fileoper.write('\nlink:\n')
            fileoper.write(''.join(item['link']))
            fileoper.write('\nmainbody:\n')
            bodys = item['mainbody']
            for bd in bodys:
                ## Assign u()'s result, as the baidu pipeline does, then
                ## encode for the file.
                bd = u(bd, unicode)
                bd = bd.encode('utf8')
                fileoper.write(bd)
            fileoper.close()
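## --- Hedged sketch: u() and utf_p() are not defined in this file. From
## their call sites (u(bod, unicode) whose result is encoded to UTF-8, and
## utf_p(s, 'utf8') used around writes), u() presumably coerces a value to
## unicode and utf_p() presumably produces a UTF-8 byte string. Minimal
## stand-ins under those assumptions; names are illustrative:
def u_sketch(value, target=unicode):
    """Return `value` as a unicode object, decoding UTF-8 bytes if needed."""
    if isinstance(value, unicode):
        return value
    return value.decode('utf-8')

def utf_p_sketch(value, encoding='utf8'):
    """Return `value` encoded to UTF-8 bytes, coercing via u_sketch first."""
    return u_sketch(value).encode(encoding)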
def parse_body(self, response):
    print 'done'
    print 'parse_body: create dir-file, write in...'
    print 'parse_body: response: ', response
    item = response.meta['item']

    ## Detect the page encoding from the <meta> tags in <head>.
    encode_0 = response.xpath('/html/head')
    encode_1 = encode_0.xpath('.//meta')
    encode_3 = ''
    for en in encode_1:
        encode_2 = en.xpath('@content').extract()
        if len(encode_2) != 0:
            encode_2 = encode_2[0]
            if encode_2.find('charset') != -1:
                ## NOTE: as in the baidu spider, splitting on 'charset='
                ## replaces the original str.strip() chain, which removed
                ## characters rather than substrings.
                encode_3 = encode_2.encode('utf-8').split('charset=')[-1].strip()
    print 'parse_body: head : encode_3 :', encode_3
    item['encode'] = encode_3

    # Build the per-day directory path.
    old_path = '/data/news_data/qq_news/'
    t = time.localtime()
    t = time.strftime('%Y_%m_%d', t)
    new_path = os.path.join(old_path, t)
    if not os.path.isdir(new_path):
        os.mkdir(new_path)
    item['path'] = new_path

    # Build the file name.
    file = ''
    hash = 0
    title = response.xpath(
        '//*[@id=\'C-Main-Article-QQ\']/div[1]/h1/text()').extract()
    print 'parse_body: title: ', ''.join(title).encode('utf-8')
    if len(title) != 0:
        title_a = ''.join(title[0]).encode('utf-8')
        sha1obj = hashlib.sha1()
        sha1obj.update(title_a)
        hash = sha1obj.hexdigest()
        print 'hash: ', hash
        file = new_path + '/' + hash  ## the SHA-1 of the title is the file name
        item['hash'] = [hash]

        ## file_check() replaces the older inline os.listdir() duplicate scan.
        path = '/data/news_data/'
        pl = crawl_body.file_check(path, hash)
        pl = 0  ## override: crawl unconditionally
        if pl == 0:
            ## File writing itself now happens in the pipeline; here we only
            ## collect the fields onto the item.
            print 'file write begins. hash: ', hash
            print 'title: ', ''.join(title[0]).encode('utf-8')
            # Get the publication time.
            time_release = response.xpath(
                '//*[@id=\'C-Main-Article-QQ\']/div[1]/div[1]/div[1]/span[@class="article-time"]/text()'
            ).extract()
            time_release = ''.join(time_release).encode('utf-8')
            print 'parse_body: time_release: ', time_release
            item['time_release'] = time_release
            # Get the abstract; it may be empty, so never index into it.
            abstract = response.xpath(
                '//*[@id=\'Cnt-Main-Article-QQ\']/p[1]/text()').extract()
            print 'parse_body: abstract: ', ''.join(abstract).encode('utf-8')
            item['abstract'] = abstract
            # Extract the main body: every <p> inside the article div.
            bodys_a = response.xpath('//div[@id=\'Cnt-Main-Article-QQ\']')
            bodys_b = bodys_a.xpath('.//p')
            print 'main_body: '
            bodys = []
            for bod in bodys_b:
                main_body = bod.xpath('text()').extract()
                if len(main_body) != 0:
                    print ''.join(main_body[0]).encode('utf-8')
                    bodys.append(''.join(main_body[0]))
            item['mainbody'] = bodys
            print 'finish.'
            print 'intheend: item = ', item
            return item
        else:
            print 'pl: ', pl
            print 'file already exists; nothing to do.'
    else:
        print 'parse_body: title is empty; nothing to do.'
    print '\n\n\n'
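## --- Hedged sketch: both charset-sniffing loops above pull the encoding
## out of <meta content="text/html; charset=..."> by string surgery. A
## regex makes the intent explicit and tolerates spacing variations; the
## function name is illustrative.
import re

def extract_charset(content):
    """'text/html; charset=gb2312' -> 'gb2312'; '' when no charset given."""
    m = re.search(r'charset\s*=\s*([\w-]+)', content, re.IGNORECASE)
    return m.group(1) if m else ''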