def parse_body(self, response):
        print 'done'
        print 'parse_body: create dir-file,write in...'
        print 'parse_body: response: ', response
        item = response.meta['item']

        ##encode
        encode_0 = response.xpath('/html/head')
        encode_1 = encode_0.xpath('.//meta')
        encode_3 = ''
        for en in encode_1:
            encode_2 = en.xpath('@content').extract()
            ##encode_2 = str(encode_2)
            if len(encode_2) != 0:
                encode_2 = encode_2[0]
                if encode_2.find('charset') != -1:
                    encode_3 = encode_2.encode('utf-8')
        ##strip() removes characters, not substrings; split on 'charset=' instead
        encode_3 = encode_3.split('charset=')[-1].strip()
        print 'parse_body: head : encode_3 :', encode_3
        item['encode'] = encode_3

        #build the file path
        old_path = '/data/news_data/baidu_news/'
        t = time.localtime()
        t = time.strftime('%Y_%m_%d', t)
        new_path = os.path.join(old_path, t)
        item['path'] = new_path
        if not os.path.isdir(new_path):
            os.mkdir(new_path)
        ##item['path'] = ''.join( new_path )

        #build the file name
        file = ''
        hash = 0
        bodys = response.xpath('//body')

        title = bodys.xpath(
            './/h1/text()  |  ../*[contains(@*,\'titl\')]').extract()
        ##time_release_t = bodys.xpath(' .//*[ contains(@*,\'time\') ]/text() ' ).extract()
        time_release_t = bodys.xpath(
            ' .//*[ contains(@*,\'time\') ]/text() |  //*[@class=\'conText\']/div[@class="summaryNew"]/text() | //div[@class="left-time"]/div[@class="left-t"]/text()  | //*[@id=\'k_left\']/div[2]/div/p/span[2]/text()  |  //*[@id=\'C-Main-Article-QQ\']/div[1]/div[1]/div[1]/span[5]   |  /html/body/div[8]/div[1]/div[4]/text() | //*[@id=\'pubtime_baidu\']/text()   '
        ).extract()
        ##title=response.xpath('//*[@class=\'conText\']/h1/text() | //div[@class="content"]/h1/text()  | //div[@class="main"]/h1/text()  | //*[@id=\'C-Main-Article-QQ\']/div[1]/h1   | /html/body/div[8]/div[1]/div[2]/text() | html/body/div[10]/div[1]/div[1]/text() ').extract()
        print 'parse_body: title: ', ''.join(title).encode('utf-8')
        if (len(title) != 0 and len(''.join(title)) > 3):
            ##title_a=''.join(title)
            title_a = ''.join(title[0]).encode('utf-8')
            ##title_a=str( ( title[0] ) ).encode('utf-8')
            sha1obj = hashlib.sha1()
            sha1obj.update(title_a)
            hash = sha1obj.hexdigest()
            print 'hash: ', hash
            file = new_path + '/' + hash  ##use the hash as the file name
            item['hash'] = [hash]

            ##check whether the file already exists under the known path
            path = '/data/news_data'
            pl = crawl_body.file_check(path, hash)
            ##pl=file_check(path,hash)
            ##start writing a text file
            if pl == 0:
                ##open the file
                print 'file write starting. hash: ', hash
                ##fp = open(file,'w')

                ##extract and write the article title
                ##get the title and compute its hash
                ##print 'title: ',''.join(title[0]).encode('utf-8')
                ##write the title
                ##fp.write( 'title:\n' )
                #fp.write( str( title[0].encode('utf-8') ) )
                ##fp.write( ''.join(title[0]).encode('utf-8') )
                ##fp.write( '\n' )

                ##get the publication time and write it
                ##time_release_t=response.xpath('//*[@class=\'conText\']/div[@class="summaryNew"]/text() | //div[@class="left-time"]/div[@class="left-t"]/text()  | //*[@id=\'k_left\']/div[2]/div/p/span[2]/text()  |  //*[@id=\'C-Main-Article-QQ\']/div[1]/div[1]/div[1]/span[5]   |  /html/body/div[8]/div[1]/div[4]/text() | //*[@id=\'pubtime_baidu\']/text()   ').extract()
                item['time_release'] = [
                    ''.join(time_release_t).encode('utf-8')
                ]
                if len(time_release_t) == 0:
                    item['time_release'] = ['']
                time_release = ''.join(time_release_t).encode('utf-8')
                print 'parse_body: time_release: ', time_release
                #fp.write( 'time_release:\n' )
                #fp.write( time_release )
                #fp.write('\n')

                ##response
                #fp.write( 'response:\n' )
                #fp.write( str(response)[5:] )
                #fp.write('\n')

                ##get the abstract and write it
                #abstract=response.xpath('//*[@id=\'Cnt-Main-Article-QQ\']/p[1]/text()').extract()
                #print 'parse_body: abstract: ',''.join(abstract).encode('utf-8')##abstract may be empty, so do not index into it.
                #item['abstract']=abstract
                #fp.write( 'abstract:\n' )
                #fp.write( ''.join(abstract ).encode('utf-8') )
                #fp.write('\n')

                ##extract the main body
                ##bodys_a=response.xpath('//*[contains(@class,\'title\')]')
                bodys_b = bodys.xpath('.//p')
                ##print 'bodys_b : ', bodys_b
                ##bodys_c = bodys_a.xpath('.//p')
                ##write the main body
                ##fp.write('main_body: \n')
                main_bodys = []
                print 'main_body: '
                for bod in bodys_b:
                    print 'bod: ', bod
                    main_body = bod.xpath('text()').extract()
                    if len(main_body) != 0:
                        main_body = main_body[0]
                        if (len(main_body) != 0
                                and len(''.join(main_body)) > 30):
                            print ''.join(main_body).encode('utf-8')
                            ##write each body paragraph
                            #fp.write( ''.join( main_body).encode('utf-8')  )
                            #fp.write('\n')
                            main_bodys.append(main_body)
                item['mainbody'] = main_bodys

                ##close the file
                ##fp.close()
                print 'finish.'
                return item

            else:
                print 'pl: ', pl
                print 'file already exists; nothing to do.'

        else:
            item['time_release'] = ['']
            item['hash'] = ['']
            print 'title is empty; skipping.'
        print '\n\n'
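Every snippet in this listing calls crawl_body.file_check(path, hash) for duplicate detection, but the helper itself is never shown. A minimal sketch follows, inferred from the return-code comments in Example #2 (pl: 0 = file missing, 1 = normal, 2 = exists but too small) and the inlined listdir check in Example #3; the 200-byte threshold is taken from that inline check, and the real helper may differ:

import os

def file_check(path, hash, min_size=200):
    ## Walk the news_data tree looking for a file named after the title hash.
    ## Return codes (inferred from the comments in the examples below):
    ##   0 - no file with this name exists anywhere under `path`
    ##   1 - the file exists and looks complete
    ##   2 - the file exists but is too small and should be rewritten
    for root, dirs, files in os.walk(path):
        if hash in files:
            if os.path.getsize(os.path.join(root, hash)) < min_size:
                return 2
            return 1
    return 0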
Example #2
    def _conditional_insert(self, tx, item):
        found = 0
        if len(item['title']) != 0:
            ##part 1 : database handling
            ##`found` sums the matched row counts across all four news tables
            found = tx.execute("select * from baidunews  where hash = %s ",
                            (item['hash'][0], ))
            found += tx.execute("select * from qqnews  where hash = %s ",
                             (item['hash'][0], ))
            found += tx.execute("select * from sinanews  where hash = %s ",
                              (item['hash'][0], ))
            found += tx.execute("select * from chinanews  where hash = %s ",
                              (item['hash'][0], ))
            ##title_a = ''.join(  item['title'][0]  ).encode('utf-8')##!! must use a strict, consistent format so the hashes stay comparable
            ##result = tx.fetchone()  ##unused: the code relies on execute() returning the row count
            if found:
                log.msg("Item already stored in db: %s" % item,
                        level=log.DEBUG)
                fp_s.write('hash_present: ')
                fp_s.write(item['hash'][0])
                fp_s.write('\n')
            else:
                ##conditional insert into the database
                tx.execute(
                    "insert into baidunews (title,link,time_release,time_add, hash,manufacturer,path,encode) "
                    "values (%s,%s,%s,%s, %s,%s ,%s, %s)",
                    (item['title'], item['link'], item['time_release'],
                     datetime.datetime.now(), item['hash'],
                     item['manufacturer'], item['path'], item['encode']))
                fp_d.write('hash_present: ')
                fp_d.write(item['hash'][0])
                fp_d.write('\n')
            log.msg("Item stored in db: %s" % item, level=log.DEBUG)

            ##save the data file
            ##keyed by the title hash
            path = '/data/news_data'
            hash = item['hash'][0]
            pl = crawl_body.file_check(path, hash)
            ##conditional save: write or rewrite the data file only when it is missing or abnormal
            ##pl: 0 : file does not exist
            ##    1 : normal
            ##    2 : exists, but too small
            if pl != 1:
                #build the file path
                old_path = '/data/news_data/baidu_news/'
                t = time.localtime()
                t = time.strftime('%Y_%m_%d', t)
                new_path = os.path.join(old_path, t)
                if not os.path.isdir(new_path):
                    os.mkdir(new_path)
                #build the file name
                file = new_path + '/' + ''.join(item['hash'])
                fileoper = open(file, 'w')
                fileoper.write('title:\n')
                fileoper.write(item['title'][0])
                fileoper.write('\ntime_release:\n')
                fileoper.write(''.join(item['time_release']))
                fileoper.write('\nlink:\n')
                fileoper.write(''.join(item['link']))
                fileoper.write('\nmainbody:\n')
                ##fileoper.write( ''.join(item['mainbody']) )
                if len(item['mainbody']) >= 1:
                    for bod in item['mainbody']:
                        bod = u(bod, unicode)  ##u() is a project helper (not shown) that coerces text to unicode
                        bod = bod.encode('utf8')
                        fileoper.write(bod)
                fileoper.close()

            print '\n\n\n\n'
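The four back-to-back SELECTs against baidunews, qqnews, sinanews and chinanews recur in every _conditional_insert variant. Assuming, as the code above does, that tx.execute() returns the number of matched rows (MySQLdb cursor behaviour), a loop over the table names would keep the check in one place; NEWS_TABLES and hash_seen are illustrative names, not part of the original project:

NEWS_TABLES = ('baidunews', 'qqnews', 'sinanews', 'chinanews')

def hash_seen(tx, title_hash):
    ## Sum the matched row counts across all four news tables;
    ## any non-zero total means this story was already stored.
    found = 0
    for table in NEWS_TABLES:
        found += tx.execute("select * from " + table + " where hash = %s ",
                            (title_hash, ))
    return found != 0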
Example #3
    def parse_body(self,response):
        print 'done'
        print 'parse_body: create dir-file,write in...'
        print 'parse_body: response: ',response
        item=response.meta['item']
   
        #build the file path
        old_path='/data/news_data/qq_news/'
        t=time.localtime()
        t=time.strftime('%Y_%m_%d',t)
        new_path=os.path.join(old_path,t)
        if not os.path.isdir(new_path):
            os.mkdir(new_path)
    
    
        #build the file name
        file=''
        hash=0
        title=response.xpath('//*[@id=\'C-Main-Article-QQ\']/div[1]/h1/text()').extract()
        print 'parse_body: title: ',''.join(title).encode('utf-8')
        if len(title)!=0:
            ##title_a=''.join(title)
            title_a=''.join(title[0]).encode('utf-8')
            ##title_a=str( ( title[0] ) ).encode('utf-8')
            sha1obj = hashlib.sha1()
            sha1obj.update(title_a)
            hash = sha1obj.hexdigest()
            print 'hash: ',hash
            file=new_path+'/'+hash  ##use the hash as the file name
            item['hash']=[hash]

            ##list the files in the current path and check them
            res=os.listdir(new_path)
            print 'parse_body: type(res): ',type(res)
            print 'parse_body: res: ',res
            #print 'res:',str(res).encode('utf-8')
            pl=0
            for sh in res:
                if sh==hash:
                    pl=1
                    print 'file already exists: hash:',hash
                    if os.path.getsize(new_path+'/'+''.join(sh) )<200:
                        pl=0
                        print 'file exists but is too small and needs rewriting. hash: ',hash
            print 'pl:',pl

            ##start writing a text file
            path='/data/news_data/'
            pl=crawl_body.file_check(path,hash)
            if pl==0:
                ##open the file
                print 'file write starting. hash: ',hash
                fp=open(file,'w')
        
                ##extract and write the article title
                ##get the title and compute its hash
                title=response.xpath('//*[@id=\'C-Main-Article-QQ\']/div[1]/h1/text()').extract()
                print 'title: ',''.join(title[0]).encode('utf-8')
                ##write the title
                fp.write( 'title:\n' )
                #fp.write( str( title[0].encode('utf-8') ) )
                fp.write( ''.join(title[0]).encode('utf-8') ) 
                fp.write( '\n' )
        
                ##get the publication time and write it
                time_release_t=response.xpath('//*[@id=\'C-Main-Article-QQ\']/div[1]/div[1]/div[1]/span[@class="article-time"]/text()  |  //*[@id="time_source"]/span/text()').extract()
                item['time_release']= [  ''.join(time_release_t).encode('utf-8') ]
                if len(time_release_t)==0:
                    item['time_release']=['']
                time_release= ''.join(time_release_t).encode('utf-8')
                print 'parse_body: time_release: ',time_release
                fp.write( 'time_release:\n' )
                fp.write( time_release )
                fp.write('\n')
        
                ##get the abstract and write it
                abstract=response.xpath('//*[@id=\'Cnt-Main-Article-QQ\']/p[1]/text()').extract()
                print 'parse_body: abstract: ',''.join(abstract).encode('utf-8')##abstract may be empty, so do not index into it.
                item['abstract']=abstract
                fp.write( 'abstract:\n' )
                fp.write( ''.join(abstract ).encode('utf-8') )
                fp.write('\n')
        
                ##extract the main body
                bodys_a=response.xpath('//div[@id=\'Cnt-Main-Article-QQ\']')
                bodys_b=bodys_a.xpath('.//p')
                ##write the main body
                fp.write('main_body: \n')
                print 'main_body: '
                fp.write('\n')
                for bod in bodys_b:
                    main_body=bod.xpath('text()').extract()
                    if len(main_body)!=0:
                        print ''.join(main_body[0]).encode('utf-8')
                        ##write each body paragraph
                        #fp.write( str( main_body[0].encode('utf-8') ) )
                        fp.write( ''.join( main_body[0]).encode('utf-8')  )
                        fp.write('\n')
    
                ##close the file
                fp.close()
                print 'finish.'
                return item
    
            else:
                print 'pl: ',pl
                print 'file already exists; nothing to do.'
    
        else:
            item['time_release']=['']
            item['hash']=['']
            print 'title is empty; skipping.'
        print '\n\n'
    def _conditional_insert(self, tx, item):
        if len(item['title']) == 0 or len(item['mainbody']) == 0:
            return
        if len(item['title']) != 0:
            ##part 1 : database handling
            found = tx.execute("select * from baidunews  where hash = %s ",
                            (item['hash'][0], ))
            found += tx.execute("select * from qqnews  where hash = %s ",
                             (item['hash'][0], ))
            found += tx.execute("select * from sinanews  where hash = %s ",
                              (item['hash'][0], ))
            found += tx.execute("select * from chinanews  where hash = %s ",
                              (item['hash'][0], ))
            tt = datetime.datetime.now()
            print 'type(tt): ', type(tt)
            if found:
                log.msg("Item has already been stored in db: %s" % item,
                        level=log.DEBUG)
                fp_s.write('hash_present: ')
                fp_s.write(item['hash'][0])
                fp_s.write('\n')
            else:
                if len(item['hash']) == 0:
                    item['hash'] = ['']
                print 'pipeline: time_release: ', item['time_release']
                if len(item['time_release']) == 0:
                    item['time_release'] = ['']

                tx.execute(
                    "insert into sinanews(title, link, response_news, time_release, time_add, hash, manufacturer)"
                    " values(%s,%s,%s,%s,%s,%s,%s)",
                    (item['title'][0], item['link'], item['response_news'],
                     item['time_release'], tt, item['hash'],
                     item['manufacturer']))
                print 'db_store: title: ', ''.join(
                    item['title'][0]).encode('utf-8')
                print 'ok????'

                fp_d.write('hash_present: ')
                fp_d.write(item['hash'][0])
                fp_d.write('\n')
                log.msg("Item is storing in db : %s" % item, level=log.DEBUG)

            ## part 2 : data file handling
            #build the file path
            old_path = '/data/news_data/sina_news/'
            hash = item['hash'][0]
            check_path = '/data/news_data'
            pl = crawl_body.file_check(check_path, hash)
            ##pl = 0
            if pl == 1:
                return
            t = time.localtime()
            t = time.strftime('%Y_%m_%d', t)
            new_path = os.path.join(old_path, t)
            if not os.path.isdir(new_path):
                os.mkdir(new_path)
            #build the file name
            file = new_path + '/' + ''.join(item['hash'][0])
            fileoper = open(file, 'w')
            fileoper.write('title:\n')
            fileoper.write(item['title'][0])
            fileoper.write('\ntime_release:\n')
            fileoper.write(''.join(item['time_release']))
            fileoper.write('\nlink:\n')
            fileoper.write(''.join(item['link'][0]))
            fileoper.write('\nmainbody:\n')
            print 'pipeline:  item[\'mainbody\'] :', item['mainbody']
            bodys = item['mainbody']
            for bd in bodys:
                bd = u(bd, 'utf8')  ##assign the result; u() does not convert in place
                fileoper.write(bd)
                fileoper.write('\n')
            fileoper.close()
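The flat-file serialization (title / time_release / link / mainbody blocks) is repeated with small variations in each pipeline. A consolidated sketch, assuming the item fields hold unicode text as returned by xpath(...).extract(); write_news_file is an illustrative name, not part of the original code:

def write_news_file(filename, item):
    ## Serialize one item to the "key:\n value" flat-file layout used above.
    fp = open(filename, 'w')
    try:
        fp.write('title:\n')
        fp.write(''.join(item['title'][0]).encode('utf-8'))
        fp.write('\ntime_release:\n')
        fp.write(''.join(item['time_release']).encode('utf-8'))
        fp.write('\nlink:\n')
        fp.write(''.join(item['link']).encode('utf-8'))
        fp.write('\nmainbody:\n')
        for paragraph in item['mainbody']:
            fp.write(paragraph.encode('utf-8'))
            fp.write('\n')
    finally:
        fp.close()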
Example #5
    def parse_body(self, response):
        #print 'parse_body: create dir-file,write in...'
        #print 'response: ',response
        item = response.meta['item']

        #build the file path
        old_path = '/data/news_data/qq_news/'
        t = time.localtime()
        t = time.strftime('%Y_%m_%d', t)
        new_path = os.path.join(old_path, t)
        if not os.path.isdir(new_path):
            os.mkdir(new_path)

        #build the file name
        #t=time.localtime()
        #tt=time.strftime('%Y_%m_%d_%X',t).replace(':','_')
        ##file=new_path+'/'+tt  ##done  ##use the time as the file name
        title = response.xpath(
            '//*[@id=\'C-Main-Article-QQ\']/div[1]/h1/text()').extract()
        file = ''
        #if len(title)!=0:
        #    title_a=str( ( title[0] )[:30] ).encode('utf-8')  ##take the first 30 characters
        #    title_b=title_a.replace(' ','_').replace(':','_').replace(':','_')
        #    file=new_path+'/'+title_b  ##use the title as the file name
        if len(title) != 0:
            title_a = ''.join(title[0]).encode('utf-8')  ##
            sha1obj = hashlib.sha1()
            sha1obj.update(title_a)
            hash = sha1obj.hexdigest()
            file = new_path + '/' + hash  ##use the hash as the file name
            item['hash'] = [hash]

            ##read the current file list
            #res=os.listdir(new_path)
            #pl=0  ##file missing
            #for sh in res:
            #    if sh==hash:
            #        pl=1  ##file exists
            #        if os.path.getsize(new_path+'/'+sh)<30:
            #            pl=0  ##file is empty
            path = '/data/news_data/'
            pl = crawl_body.file_check(path, hash)
            if pl == 0:  ##file missing
                #print 'file missing; creating and writing it.'
                ##open the file
                fp = open(file, 'w')

                ##extract and write the article title
                #title=response.xpath('//*[@id=\'C-Main-Article-QQ\']/div[1]/h1/text()').extract()
                #if len(title)!=0:
                #print 'title: ',title[0].encode('utf-8')

                ##write the title
                fp.write('title:\n')
                fp.write(str(title[0].encode('utf-8')))
                fp.write('\n')

                ##get the publication time and write it
                ##.//*[@id='C-Main-Article-QQ']/div[1]/div[1]/div[1]/span[5]
                ##.//*[@id='C-Main-Article-QQ']/div[1]/div/div[1]/span[6]
                #time_release=response.xpath('//*[@id=\'C-Main-Article-QQ\']/div[1]/div[1]/div[1]/span[@class="article-time"]/text()|\
                #       //*[@id=\'C-Main-Article-QQ\']/div[1]/div[1]/div[1]/span[@class="article-time"]/text()  \
                #       ').extract()
                time_release = response.xpath(
                    '//*[@id=\'C-Main-Article-QQ\']/div[1]/div[1]/div[1]/span[@class="article-time"]/text()'
                ).extract()
                time_release = ''.join(time_release).encode('utf-8')
                #print 'parse_body: time_release: ',time_release
                item['time_release'] = time_release
                #print 'parse_body: item[\'time_release\'] :',item['time_release']
                fp.write('time_release:\n')
                fp.write(time_release)
                fp.write('\n')

                ##get the abstract and write it
                abstract = response.xpath(
                    '//*[@id=\'Cnt-Main-Article-QQ\']/p[1]/text()').extract()
                #print 'parse_body: abstract: ',''.join(abstract).encode('utf-8')##abstract may be empty, so do not index into it.
                item['abstract'] = abstract
                fp.write('abstract:\n')
                fp.write(''.join(abstract).encode('utf-8'))
                fp.write('\n')

                ##extract the main body
                bodys_a = response.xpath('//div[@id=\'Cnt-Main-Article-QQ\']')
                bodys_b = bodys_a.xpath('.//p')

                ##write the main body
                fp.write('main_body: \n')
                fp.write('\n')
                for bod in bodys_b:
                    main_body = bod.xpath('text()').extract()
                    if len(main_body) != 0:
                        #print 'main_body: ',main_body[0].encode('utf-8')
                        ##write each body paragraph
                        fp.write(str(main_body[0].encode('utf-8')))
                        fp.write('\n')

                ##close the file
                fp.close()
                return item
            else:
                pass
                #print 'file already exists; no rewrite needed.'
        else:
            pass
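Every parse_body derives both the file name and the database dedup key the same way: a SHA-1 digest of the UTF-8 encoded first title fragment. Factored out as a sketch (title_hash is an illustrative name):

import hashlib

def title_hash(title):
    ## SHA-1 hex digest of the first title fragment, UTF-8 encoded.
    ## The digest doubles as the on-disk file name and as the `hash`
    ## column used for duplicate detection in the news tables.
    sha1obj = hashlib.sha1()
    sha1obj.update(''.join(title[0]).encode('utf-8'))
    return sha1obj.hexdigest()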
Example #6
    def _conditional_insert(self, tx, item):
        found = 0
        now = datetime.datetime.now()

        ##determine the file name
        ##title_a=['']
        if len(item['title'])!=0:
            ##part 1 : database handling
            found = tx.execute("select * from baidunews  where hash = %s ", (item['hash'][0], ))
            found += tx.execute("select * from qqnews  where hash = %s ", (item['hash'][0], ))
            found += tx.execute("select * from sinanews  where hash = %s ", (item['hash'][0], ))
            found += tx.execute("select * from chinanews  where hash = %s ", (item['hash'][0], ))
            ##title_a = ''.join(  item['title'][0]  ).encode('utf-8')##!! must use a strict, consistent format so the hashes stay comparable
            ##condition check
            if found != 0:
                ##if this title is already in the database, do not insert it again
                log.msg("Item already stored in db: %s" % item, level=log.DEBUG)
                fp_s.write('hash_present: ')
                fp_s.write( item['hash'][0] )
                fp_s.write( '\n' )
            else:
                fp_d.write('hash_present: ')
                fp_d.write( item['hash'][0] )
                fp_d.write( '\n' )
                t = datetime.datetime.now()
                tx.execute(
                    "insert into qqnews(title,link,response_news,time_release,time_add,hash,manufacturer,path,encode)"
                    " values(%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                    (item['title'][0], item['link'], item['response_news'],
                     item['time_release'], t, item['hash'],
                     item['manufacturer'], item['path'], item['encode']))
                print 'db_store: title: ',''.join(item['title'][0]).encode('utf-8')
                log.msg("Item stored in db: %s" % item, level=log.DEBUG)
            

            path = '/data/news_data'
            hash = item['hash'][0]  ##file_check expects the digest string, not the one-element list
            pl = crawl_body.file_check(path,hash)
            if pl != 1:
                #build the file path
                old_path='/data/news_data/qq_news/'
                t=time.localtime()
                t=time.strftime('%Y_%m_%d',t)
                new_path=os.path.join(old_path,t)
                if not os.path.isdir(new_path):
                    os.mkdir(new_path)
                        
                #build the file name
                file = new_path+'/'+''.join( item['hash'] )
                fileoper = open(file,'w')
                fileoper.write('title:\n')
                ##NOTE: u() and utf_p() return the converted text rather than
                ##converting in place, so the result must be captured
                title_text = u( item['title'][0] ,'unicode' )
                utf_p( title_text ,'utf8' )
                fileoper.write( title_text )
                fileoper.write('\ntime_release:\n'  )
                fileoper.write( ''.join(item['time_release']) )
                fileoper.write('\nlink:\n')
                fileoper.write( ''.join(item['link']) )
                fileoper.write('\nmainbody:\n')
                bodys = item['mainbody']
                for bd in bodys:
                    bd = u( bd,unicode )  ##assign the result; u() does not convert in place
                    bd = bd.encode('utf8')
                    ##utf_p( bd,'utf8' )
                    fileoper.write( bd )
                fileoper.close()
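The dated-directory setup (strftime('%Y_%m_%d'), os.path.join, os.mkdir) likewise appears in every snippet. One possible consolidation, using os.makedirs so a missing parent such as /data/news_data/qq_news/ does not crash the spider; daily_dir is an illustrative name:

import os
import time

def daily_dir(base):
    ## e.g. daily_dir('/data/news_data/qq_news/') -> '/data/news_data/qq_news/2015_06_01'
    day = time.strftime('%Y_%m_%d', time.localtime())
    path = os.path.join(base, day)
    if not os.path.isdir(path):
        os.makedirs(path)  ## unlike os.mkdir, also creates missing parents
    return path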
Example #7
    def parse_body(self, response):
        print 'done'
        print 'parse_body: create dir-file,write in...'
        print 'parse_body: response: ', response
        #sel_a=response.xpath('//div[contains(@id,"newsContent")]')
        #sel_b=sel_a.xpath('.//ul')
        #sel_c=sel_b.xpath('.//li')
        item = response.meta['item']

        ##encode
        encode_0 = response.xpath('/html/head')
        encode_1 = encode_0.xpath('.//meta')
        encode_3 = ''
        for en in encode_1:
            encode_2 = en.xpath('@content').extract()
            ##encode_2 = str(encode_2)
            if len(encode_2) != 0:
                encode_2 = encode_2[0]
                if encode_2.find('charset') != -1:
                    encode_3 = encode_2.encode('utf-8')
        ##strip() removes characters, not substrings; split on 'charset=' instead
        encode_3 = encode_3.split('charset=')[-1].strip()
        print 'parse_body: head : encode_3 :', encode_3
        item['encode'] = encode_3

        #build the file path
        old_path = '/data/news_data/qq_news/'
        t = time.localtime()
        t = time.strftime('%Y_%m_%d', t)
        new_path = os.path.join(old_path, t)
        if not os.path.isdir(new_path):
            os.mkdir(new_path)
        item['path'] = new_path

        #build the file name
        file = ''
        hash = 0
        #items=[]
        title = response.xpath(
            '//*[@id=\'C-Main-Article-QQ\']/div[1]/h1/text()').extract()
        print 'parse_body: title: ', ''.join(title).encode('utf-8')
        if len(title) != 0:
            title_a = ''.join(title[0]).encode('utf-8')
            ##title_a=str( ( title[0] ) ).encode('utf-8')
            sha1obj = hashlib.sha1()
            sha1obj.update(title_a)
            hash = sha1obj.hexdigest()
            print 'hash: ', hash
            file = new_path + '/' + hash  ##use the hash as the file name
            item['hash'] = [hash]

            ##list the files in the current path and check them
            #res=os.listdir(new_path)
            #print 'parse_body: type(res): ',type(res)
            #print 'parse_body: res: ',res
            #pl=0
            #for sh in res:
            #    if sh==hash:
            #        pl=1
            #        print 'file already exists: hash:',hash
            #        if os.path.getsize(new_path+'/'+''.join(sh) )<200:
            #            pl=0
            #            print 'file exists but is too small and needs rewriting. hash: ',hash
            #print 'pl:',pl
            #
            path = '/data/news_data/'
            pl = crawl_body.file_check(path, hash)
            pl = 0  ##crawl unconditionally, overriding the file check
            ##start writing a text file
            if pl == 0:
                ##open the file
                print 'file write starting. hash: ', hash
                #fp=open(file,'w')

                ##extract and write the article title
                ##get the title and compute its hash
                title = response.xpath(
                    '//*[@id=\'C-Main-Article-QQ\']/div[1]/h1/text()').extract(
                    )
                print 'title: ', ''.join(title[0]).encode('utf-8')
                ##write the title
                #fp.write( 'title:\n' )
                #fp.write( str( title[0].encode('utf-8') ) )
                #fp.write( ''.join(title[0]).encode('utf-8') )
                #fp.write( '\n' )

                ##get the publication time and write it
                time_release = response.xpath(
                    '//*[@id=\'C-Main-Article-QQ\']/div[1]/div[1]/div[1]/span[@class="article-time"]/text()'
                ).extract()
                time_release = ''.join(time_release).encode('utf-8')
                print 'parse_body: time_release: ', time_release
                item['time_release'] = time_release
                #fp.write( 'time_release:\n' )
                #fp.write( time_release )
                #fp.write('\n')

                ##get the abstract and write it
                abstract = response.xpath(
                    '//*[@id=\'Cnt-Main-Article-QQ\']/p[1]/text()').extract()
                print 'parse_body: abstract: ', ''.join(abstract).encode(
                    'utf-8')  ##abstract may be empty, so do not index into it.
                item['abstract'] = abstract
                #fp.write( 'abstract:\n' )
                #fp.write( ''.join(abstract ).encode('utf-8') )
                #fp.write('\n')

                ##extract the main body
                bodys_a = response.xpath('//div[@id=\'Cnt-Main-Article-QQ\']')
                bodys_b = bodys_a.xpath('.//p')
                ##write the main body
                #fp.write('main_body: \n')
                print 'main_body: '
                #fp.write('\n')
                bodys = []
                for bod in bodys_b:
                    main_body = bod.xpath('text()').extract()
                    if len(main_body) != 0:
                        print ''.join(main_body[0]).encode('utf-8')
                        ##write each body paragraph
                        #fp.write( str( main_body[0].encode('utf-8') ) )
                        #fp.write( ''.join( main_body[0]).encode('utf-8')  )
                        #fp.write('\n')
                        bodys.append(''.join(main_body[0]))
                item['mainbody'] = bodys
                #fp.write('finish.\n')

                ##close the file
                ##fp.close()
                print 'finish.'
                print 'intheend: item = ', item
                #        items.append(item)
                return item

            else:
                print 'pl: ', pl
                print 'file already exists; nothing to do.'

        else:
            print 'parse_body: title is empty; skipping.'
        print '\n\n\n'
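A closing note on the charset extraction in Examples #1 and #7: the original chained strip() calls remove characters, not substrings, and only recovered the charset by accident; the in-place fixes above split on 'charset=' instead. A regex version of the same idea, as a sketch:

import re

def extract_charset(content, default=''):
    ## content typically looks like 'text/html; charset=gb2312'
    match = re.search(r'charset=([\w-]+)', content, re.I)
    return match.group(1) if match else default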