Ejemplo n.º 1
0
 def parse_detail(self, response):
     """Complete the item carried in ``response.meta['meta_1']`` and yield it.

     The item must already contain ``News_Title`` and ``News_Dt``.  Nothing
     is yielded when the page has no ``#articleC`` element.  When the page
     shows a pager, the article body of the next page is downloaded and
     appended to the content.
     """
     item = response.meta['meta_1']
     content = response.xpath('//*[@id="articleC"]').extract_first()
     if content is None:
         return

     if response.xpath('//div[@class="page"]'):
         next_url = response.xpath(
             '//*[@id="nextPage"]/a/@href').extract_first()
         # Without a next-page link urljoin() would return the current URL
         # and the current page would simply be fetched again.
         if next_url:
             url = urljoin(response.url, next_url)
             # A timeout keeps a stalled server from blocking the callback.
             data = requests.get(url, timeout=30)
             extra = re.findall(
                 r'<div id="articleC" class="article_con" >(.*?)</div>',
                 data.text, re.S)
             # The pattern depends on exact markup; keep the first page
             # alone when it does not match.
             if extra:
                 content = content + extra[0]

     author = response.xpath(
         '//*[@id="laiyuan_mp"]/a/span/text()').extract_first()
     if author is None:
         # Fall back to the text that follows the first colon of the
         # "#laiyuan" span; use an empty author when that is missing too.
         source = response.xpath('//*[@id="laiyuan"]/span').xpath(
             'string(.)').extract_first()
         author = ''
         if source is not None:
             parts = source.split(':')
             if len(parts) > 1:
                 author = parts[1].strip()
     item['Author'] = author

     str_time = '<div class="explain"><span>' + item['Author'] + '</span><time>' + str(
         item['News_Dt'].split(' ')[0]) + '</time></div>'
     content1 = '<h1>' + item[
         'News_Title'] + '</h1>' + str_time + "<div class='content'>" + content + "</div>"
     item['Content'] = content1

     # Collect every image URL referenced by the assembled HTML.
     img_list = etree.HTML(content1).xpath('//img/@src')
     get_pic(item, img_list)
     item['Update_Tm'] = get_time_stamp()
     item['Web_Id'] = '5-32'
     yield item
Ejemplo n.º 2
0
 def parse_detail(self, response):
     """Complete the item carried in ``response.meta['meta_1']`` and yield it.

     The item must already contain ``News_Title`` and ``News_Dt``.  Nothing
     is yielded when the page has no ``div.newscontxt`` element.  The
     author defaults to ``'EEWORLD'`` when the byline cannot be parsed.
     """
     content = response.xpath('//div[@class="newscontxt"]').extract_first()
     item = response.meta['meta_1']
     if content is None:
         return

     keywords = response.xpath('//div[@class="newscontxt"]//h4//a'
                               ).xpath('string(.)').extract()
     item['Keywords'] = ','.join(keywords)

     author = 'EEWORLD'
     byline = response.xpath(
         '//*[@id="newsptit"]/div[1]/div/h6/span[2]/text()').extract_first()
     if byline is not None:
         parts = byline.split(':')
         if len(parts) < 2:
             # The original retried with the very same separator, which
             # could never help; presumably the full-width colon (U+FF1A)
             # was intended -- TODO confirm against the live page.
             parts = byline.split('\uff1a')
         # Keep the default when there is no separator at all (this used
         # to raise IndexError) or when the name is a single blank.
         if len(parts) >= 2 and parts[1] != ' ':
             author = parts[1]
     item['Author'] = author

     str_time = '<div class="explain"><span>' + item[
         'Author'] + '</span><time>' + str(
             item['News_Dt'].split(' ')[0]) + '</time></div>'
     content1 = '<h1>' + item[
         'News_Title'] + '</h1>' + str_time + "<div class='content'>" + content + "</div>"
     item['Content'] = content1

     # Collect every image URL referenced by the assembled HTML.
     img_list = etree.HTML(content1).xpath('//img/@src')
     get_pic(item, img_list)
     item['Update_Tm'] = get_time_stamp()
     item['Web_Id'] = '5-30'
     yield item
Ejemplo n.º 3
0
    def parse_detail(self, response):
        """Build an ``EleIndustryItem`` from an article page and yield it.

        Nothing is yielded unless the page provides a ``div.contentlist``
        body, an ``<h1>`` title and a ``div.newstitle`` span text that
        contains a ``d-d-d`` date.
        """
        content = response.xpath(
            '//div[@class="contentlist"]').extract_first()
        if content is None:
            return
        title = response.xpath('//h1/text()').extract_first()
        if title is None:
            return
        title = title.strip()
        byline = response.xpath(
            '//div[@class="newstitle"]/span/text()').extract_first()
        if byline is None:
            return
        dates = re.findall(r'\d+-\d+-\d+', byline)
        if not dates:
            return

        time1 = dates[0] + ' ' + Get_Time()
        # The author is the second comma-separated field of the byline; a
        # byline without a comma is the only way this lookup can fail.
        try:
            author = byline.split(',')[1].strip()
        except IndexError:
            author = ''
        keywords = response.xpath(
            '//div[@class="newstitle"]/span/a/text()').extract_first(
                default='')

        item = EleIndustryItem()
        item['News_Title'] = title
        item['News_Dt'] = time1
        item['Author'] = author
        item['Keywords'] = keywords
        str_time = '<div class="explain"><span>' + item[
            'Author'] + '</span><time>' + str(
                item['News_Dt'].split(' ')[0]) + '</time></div>'
        content1 = '<h1>' + item[
            'News_Title'] + '</h1>' + str_time + "<div class='content'>" + content + "</div>"
        # change_content returns the HTML at index 0 and the image list
        # at index 1.
        data = change_content(content1, 'http://www.ic37.com')
        item['Content'] = data[0]
        img_list = data[1]
        get_pic(item, img_list)
        item['Update_Tm'] = get_time_stamp()
        item['Abstract'] = ''
        item['URL'] = response.url
        item['Web_Id'] = '5-37'
        yield item
Ejemplo n.º 4
0
 def parse_detail(self, response):
     """Build an ``EleIndustryItem`` from an article page and yield it.

     Nothing is yielded unless the page has a ``div.article-body`` body,
     an ``h1.headline`` title and at least one ``span.mr20`` text inside
     ``div.muted subline``.
     """
     body_nodes = response.xpath('//div[@class="article-body"]')
     if not body_nodes:
         return
     item = EleIndustryItem()
     body_html = body_nodes.extract_first()

     headline_nodes = response.xpath('//h1[@class="headline"]/text()')
     if not headline_nodes:
         return
     headline = headline_nodes.extract_first().strip()

     meta_texts = response.xpath(
         '//div[@class="muted subline"]//span[@class="mr20"]/text()'
     ).extract()
     if not meta_texts:
         return

     # First span text starts with the date; a second one, when it is the
     # only other entry, holds the author.
     published = meta_texts[0].strip().split(' ')[0] + ' ' + Get_Time()
     writer = meta_texts[1].strip() if len(meta_texts) == 2 else ''
     # Joining an empty list gives '', the same default as before.
     tags = ','.join(
         response.xpath(
             '//div[@class="overhide overhidden new-tags"]/a/text()'
         ).extract())

     item['News_Title'] = headline
     item['News_Dt'] = published
     item['Author'] = writer
     item['Keywords'] = tags

     explain = ('<div class="explain"><span>' + writer + '</span><time>'
                + str(published.split(' ')[0]) + '</time></div>')
     item['Content'] = ('<h1>' + headline + '</h1>' + explain
                        + "<div class='content'>" + body_html + "</div>")

     tree = etree.HTML(item['Content'])
     get_pic(item, tree.xpath('//img/@src'))
     item['Update_Tm'] = get_time_stamp()
     item['Abstract'] = ''
     item['URL'] = response.url
     item['Web_Id'] = '5-31'
     yield item
Ejemplo n.º 5
0
 def parse_item1(self, response):
     """Build an ``EleIndustryItem`` from an article page and yield it.

     Nothing is yielded unless the page has an ``#art_body`` element, an
     ``<h1>`` and a ``span.time`` whose markup contains a ``d-d-d`` date.
     """
     content = response.xpath('//*[@id="art_body"]').extract_first()
     if content is None:
         return
     title_nodes = response.xpath('//h1')
     if not title_nodes:
         return
     title = title_nodes.xpath('string(.)').extract_first().strip()

     time_html = response.xpath('//span[@class="time"]').extract_first()
     if time_html is None:
         return
     dates = re.findall(r'\d+-\d+-\d+', time_html)
     if not dates:
         return
     time1 = dates[0] + ' ' + Get_Time()

     author = response.xpath('//span[@class="zuozhe"]').xpath(
         'string(.)').extract_first()
     # A missing author span yields None, which used to break the string
     # concatenation below; a bare label means "no author" as well.
     if author is None or author == '作者:':
         author = ''
     # May be None when the span is absent; stored unchanged.
     keywords = response.xpath(
         '//span[@class="mbx"]/text()').extract_first()

     item = EleIndustryItem()
     item['News_Title'] = title
     item['News_Dt'] = time1
     item['Author'] = author
     item['Keywords'] = keywords
     str_time = '<div class="explain"><span>' + item[
         'Author'] + '</span><time>' + str(
             item['News_Dt'].split(' ')[0]) + '</time></div>'
     content1 = '<h1>' + item[
         'News_Title'] + '</h1>' + str_time + "<div class='content'>" + content + "</div>"
     item['Content'] = content1

     # Collect every image URL referenced by the assembled HTML.
     img_list = etree.HTML(item['Content']).xpath('//img/@src')
     get_pic(item, img_list)
     item['Update_Tm'] = get_time_stamp()
     item['Abstract'] = ''
     item['URL'] = response.url
     item['Web_Id'] = '5-34'
     yield item
Ejemplo n.º 6
0
 def parse_item(self, response):
     """Build an ``EleIndustryItem`` from an article page and yield it.

     Nothing is yielded unless the page has an ``#articlebody`` element,
     an ``#articledetail/h1`` title and a second ``span`` under
     ``#articledetail/p`` whose text contains a ``d-d-d`` date.
     """
     content = response.xpath('//*[@id="articlebody"]').extract_first()
     if content is None:
         return
     # Drop the first <center>...</center> fragment from the body.
     centered = re.findall(r'<center>.*?</center>', content, re.S)
     if centered:
         content = content.replace(centered[0], '')

     title = response.xpath(
         '//*[@id="articledetail"]/h1/text()').extract_first()
     if title is None:
         return
     title = title.strip()

     meta_text = response.xpath(
         '//*[@id="articledetail"]/p/span[2]/text()').extract_first()
     if meta_text is None:
         return
     # The original passed the bound method itself to str() (missing call
     # parentheses), so the date was searched in a repr, not in the text.
     dates = re.findall(r'\d+-\d+-\d+', meta_text)
     if not dates:
         return
     time1 = dates[0] + ' ' + Get_Time()

     # Fields are separated by two non-breaking spaces; the author label
     # is looked up in the second field when there is one.
     author1 = ''
     fields = meta_text.split('\xa0\xa0')
     if len(fields) > 1:
         found = re.findall(r'作者:(.*)', fields[1])
         if found and found[0].strip() != 'n':
             author1 = found[0].strip()

     # May be None when the breadcrumb link is absent; stored unchanged.
     keywords = response.xpath(
         '//*[@id="navigation"]/p/a[3]/text()').extract_first()

     item = EleIndustryItem()
     item['News_Title'] = title
     item['News_Dt'] = time1
     item['Author'] = author1
     item['Keywords'] = keywords
     str_time = '<div class="explain"><span>' + item['Author'] + '</span><time>' + str(
         item['News_Dt'].split(' ')[0]) + '</time></div>'
     content1 = '<h1>' + item[
         'News_Title'] + '</h1>' + str_time + "<div class='content'>" + content + "</div>"
     item['Content'] = content1

     # Collect every image URL referenced by the assembled HTML.
     img_list = etree.HTML(item['Content']).xpath('//img/@src')
     get_pic(item, img_list)
     item['Update_Tm'] = get_time_stamp()
     item['Abstract'] = ''
     item['URL'] = response.url
     item['Web_Id'] = '5-33'
     yield item
Ejemplo n.º 7
0
 def parse_item1(self, response):
     """Build an ``EleIndustryItem`` from an article page and yield it.

     Nothing is yielded unless the page has a ``div.content`` body, an
     ``<h2>`` title and a first ``span`` in ``div.title-box/p`` whose text
     contains a colon followed by the date.
     """
     content = response.xpath('//div[@class="content"]').extract_first()
     if content is None:
         return
     title = response.xpath('//h2/text()').extract_first()
     if title is None:
         return

     time_text = response.xpath(
         '//div[@class="title-box"]/p/span[1]/text()').extract_first()
     if time_text is None:
         return
     time_parts = time_text.split(':')
     # Without a colon there is no date field; this used to raise
     # IndexError, now the page is skipped like any other undated page.
     if len(time_parts) < 2:
         return
     time1 = time_parts[1] + ' ' + Get_Time()

     author1 = ''
     author_text = response.xpath(
         '//div[@class="title-box"]/p/span[2]/text()').extract_first()
     if author_text is not None:
         author_parts = author_text.split(':')
         # Values ending in '.com' are discarded, as before.
         if len(author_parts) > 1 and not author_parts[1].endswith('.com'):
             author1 = author_parts[1]

     item = EleIndustryItem()
     item['News_Title'] = title
     item['News_Dt'] = time1
     item['Author'] = author1
     item['Keywords'] = ''
     str_time = '<div class="explain"><span>' + item[
         'Author'] + '</span><time>' + str(
             item['News_Dt'].split(' ')[0]) + '</time></div>'
     content1 = '<h1>' + item[
         'News_Title'] + '</h1>' + str_time + "<div class='content'>" + content + "</div>"
     # change_content returns the HTML at index 0 and the image list at
     # index 1.
     data = change_content(content1, self.start_urls[0])
     item['Content'] = data[0]
     img_list = data[1]
     get_pic(item, img_list)
     item['Update_Tm'] = get_time_stamp()
     item['Abstract'] = ''
     item['URL'] = response.url
     item['Web_Id'] = '5-26'
     yield item
Ejemplo n.º 8
0
 def parse_item(self, response):
     """Build an ``EleIndustryItem`` from an article page and yield it.

     Nothing is yielded unless the page has a ``#contentDiv`` element, an
     ``<h1>`` title and a second ``span`` in ``div.authorTimeSource``
     whose text contains a colon followed by the date.
     """
     content = response.xpath('//*[@id="contentDiv"]').extract_first()
     if content is None:
         return
     title = response.xpath('//h1/text()').extract_first()
     if title is None:
         return

     time_text = response.xpath(
         '//div[@class="authorTimeSource"]/span[2]/text()').extract_first()
     if time_text is None:
         return
     time_parts = time_text.split(':')
     # Without a colon there is no date field; this used to raise
     # IndexError, now the page is skipped like any other undated page.
     if len(time_parts) < 2:
         return
     time1 = time_parts[1] + ' ' + Get_Time()

     # The author span may be missing (extract_first() gives None) or lack
     # the colon; both used to crash, now the author is left empty.
     author = ''
     author_text = response.xpath(
         '//div[@class="authorTimeSource"]/span[3]/text()'
     ).extract_first()
     if author_text is not None:
         author_parts = author_text.split(':')
         if len(author_parts) > 1:
             author = author_parts[1]

     tags = response.xpath('//div[@class="keyWord"]//em').xpath(
         'string(.)').extract()

     item = EleIndustryItem()
     item['News_Title'] = title
     item['News_Dt'] = time1
     item['Author'] = author
     item['Keywords'] = ','.join(tags)
     str_time = '<div class="explain"><span>' + item[
         'Author'] + '</span><time>' + str(
             item['News_Dt'].split(' ')[0]) + '</time></div>'
     content1 = '<h1>' + item[
         'News_Title'] + '</h1>' + str_time + "<div class='content'>" + content + "</div>"
     # change_content returns the HTML at index 0 and the image list at
     # index 1.
     data = change_content(content1, self.start_urls[0])
     item['Content'] = data[0]
     img_list = data[1]
     get_pic(item, img_list)
     item['Update_Tm'] = get_time_stamp()
     item['Abstract'] = ''
     item['URL'] = response.url
     item['Web_Id'] = '5-27'
     yield item
Ejemplo n.º 9
0
 def parse_detail(self, response):
     """Complete the item carried in ``response.meta['meta_1']`` and yield it.

     The item must already contain ``News_Title``, ``News_Dt`` and
     ``Image_URL``.  Nothing is yielded when the page has no ``#newsInfo``
     element.
     """
     item = response.meta['meta_1']
     info_nodes = response.xpath('//*[@id="newsInfo"]')
     if not info_nodes:
         return
     body_html = info_nodes.extract_first()

     item['Author'] = '华强资讯'
     item['URL'] = response.url

     day = str(item['News_Dt'].split(' ')[0])
     explain = ('<div class="explain"><span>' + item['Author']
                + '</span><time>' + day + '</time></div>')
     page_html = ('<h1>' + item['News_Title'] + '</h1>' + explain
                  + "<div class='content'>" + body_html + "</div>")

     rewritten = change_content(page_html, 'http://news.hqew.com')
     item['Content'] = rewritten[0]

     # The last collected image is always discarded; if nothing is left
     # afterwards the item's own Image_URL is used instead.
     pictures = rewritten[1]
     if pictures:
         pictures.pop()
     if not pictures:
         pictures.append(item['Image_URL'])

     get_pic(item, pictures)
     item['Update_Tm'] = get_time_stamp()
     item['Web_Id'] = '5-36'
     yield item
Ejemplo n.º 10
0
 def parse_item1(self, response):
     """Build an ``EleIndustryItem`` from an article page and yield it.

     Nothing is yielded unless the page has a ``div.detailcon`` body, a
     title and a ``div.detailintro`` text of the form ``author|...date...``
     containing a ``d-d-d`` date after the first ``|``.
     """
     content = response.xpath('//div[@class="detailcon"]').extract_first()
     if content is None:
         return
     # The leading blank inside the class value is part of the selector.
     title = response.xpath(
         '//div[@class=" detailtitle"]/text()').extract_first()
     if title is None:
         return
     title = title.strip()

     intro = response.xpath('//div[@class="detailintro"]').xpath(
         'string(.)').extract_first()
     if intro is None:
         return
     fields = intro.split('|')
     # Without a '|' there is no date field; this used to raise
     # IndexError, now the page is skipped like any other undated page.
     if len(fields) < 2:
         return
     author = fields[0]
     dates = re.compile(r'\d+-\d+-\d+', re.S).findall(fields[1])
     if not dates:
         return
     time1 = dates[0] + ' ' + Get_Time()

     item = EleIndustryItem()
     item['News_Title'] = title
     item['News_Dt'] = time1
     item['Author'] = author
     item['Keywords'] = ''
     str_time = '<div class="explain"><span>' + item[
         'Author'] + '</span><time>' + str(
             item['News_Dt'].split(' ')[0]) + '</time></div>'
     content1 = '<h1>' + item[
         'News_Title'] + '</h1>' + str_time + "<div class='content'>" + content + "</div>"
     # change_content returns the HTML at index 0 and the image list at
     # index 1.
     data = change_content(content1, self.start_urls[0])
     item['Content'] = data[0]
     img_list = data[1]
     get_pic(item, img_list)
     item['Update_Tm'] = get_time_stamp()
     item['Abstract'] = ''
     item['URL'] = response.url
     item['Web_Id'] = '5-27'
     yield item
Ejemplo n.º 11
0
    def parse_item(self, response):
        """Yield the items that can be built from one page.

        The page body is looked up in this order:

        1. ``div.simditor-body clearfix``
        2. ``div.pct`` (only when 1 is absent)
        3. ``div.article-content`` and ``div.author_des`` (only when 1 and
           2 are absent; each of the two may produce its own item)

        A layout whose title or date cannot be read produces no item.
        """
        simditor_html = response.xpath(
            '//div[@class="simditor-body clearfix"]').extract_first()
        if simditor_html is not None:
            item = self._simditor_item(response, simditor_html)
            if item is not None:
                yield item
            return

        pct_html = response.xpath('//div[@class="pct"]').extract_first()
        if pct_html is not None:
            item = self._pct_item(response, pct_html)
            if item is not None:
                yield item
            return

        article_html = response.xpath(
            '//div[@class="article-content"]').extract_first()
        author_des_html = response.xpath(
            '//div[@class="author_des"]').extract_first()
        if article_html is not None:
            item = self._article_item(response, article_html)
            if item is not None:
                yield item
        if author_des_html is not None:
            item = self._author_des_item(response, author_des_html)
            if item is not None:
                yield item

    def _complete_item(self, response, title, news_dt, author, keywords,
                       body_html):
        """Create the item, render its HTML content and fill common fields."""
        item = EleIndustryItem()
        item['News_Title'] = title
        item['News_Dt'] = news_dt
        item['Author'] = author
        item['Keywords'] = keywords
        str_time = '<div class="explain"><span>' + author + '</span><time>' + \
                   news_dt.split(' ')[0] + '</time></div>'
        html = '<h1>' + title + '</h1>' + str_time + "<div class='content'>" + body_html + "</div>"
        # change_content returns the HTML at index 0 and the image list at
        # index 1.
        data = change_content(html, self.start_urls[0])
        item['Content'] = data[0]
        get_pic(item, data[1])
        item['Update_Tm'] = get_time_stamp()
        item['Abstract'] = ''
        item['URL'] = response.url
        item['Web_Id'] = '5-24'
        return item

    def _date_part(self, text):
        """Turn a 'Y年M月D日 ...' text into 'Y-M-D' (first blank-separated token)."""
        text = text.replace('年', '-').replace('月', '-').replace('日', '')
        return text.strip().split(' ')[0]

    def _pct_item(self, response, body_html):
        """Return the item for a ``div.pct`` page, or None if data is missing."""
        stamp = response.xpath(
            '//div[@class="bar_tip float_l"]/em/span/@title')
        if not stamp:
            # No title attribute: fall back to the visible text.
            stamp = response.xpath(
                '//div[@class="bar_tip float_l"]/em/text()')
        if not stamp:
            return None
        dates = re.findall(r'(\d+-\d+-\d+)', stamp.extract_first())
        if not dates:
            return None
        title = response.xpath(
            '//*[@id="thread_subject"]/text()').extract_first()
        if title is None:
            return None
        news_dt = dates[0] + ' ' + Get_Time()
        author = response.xpath(
            '//div[@class="bar_tip float_l"]/div/span/a/span/text()'
        ).extract_first(default='')
        return self._complete_item(response, title, news_dt, author, '',
                                   body_html)

    def _article_item(self, response, body_html):
        """Return the item for a ``div.article-content`` page, or None."""
        title = response.xpath('//h1/text()').extract_first()
        if title is None:
            return None
        raw_time = response.xpath(
            '//div[@class="fl"]/em[3]/text()').extract_first()
        if raw_time is None:
            return None
        news_dt = self._date_part(raw_time) + ' ' + Get_Time()
        author = response.xpath(
            '//div[@class="fl"]/em[1]/text()').extract_first(default='')
        tags = ','.join(
            response.xpath('//div[@class="tag"]//span').xpath(
                'string(.)').extract())
        return self._complete_item(response, title, news_dt, author, tags,
                                   body_html)

    def _author_des_item(self, response, body_html):
        """Return the item for a ``div.author_des`` page, or None."""
        title = response.xpath('//h1/text()').extract_first()
        if title is None:
            return None
        span_node = response.xpath(
            '//span[@class="float_left font-small color_gray"]'
        ).xpath('string(.)').extract_first()
        # A missing span gives None, which used to crash on .split().
        if span_node is None:
            return None
        span_text = span_node.split(' ')
        # Only a text of exactly three blank-separated tokens is accepted;
        # the first is the author, the second the date.
        if len(span_text) != 3:
            return None
        author = span_text[0].replace('\r', '').replace(
            '\n', '').replace('\t', '').replace('发表于', '')
        news_dt = span_text[1] + ' ' + Get_Time()
        tags = ','.join(
            response.xpath(
                '//ul[@class="article_tags clearfix"]/li/span'
            ).xpath('string(.)').extract())
        return self._complete_item(response, title, news_dt, author, tags,
                                   body_html)

    def _simditor_item(self, response, body_html):
        """Return the item for a ``div.simditor-body`` page, or None."""
        title = response.xpath('//h1/text()').extract_first()
        if title is None:
            return None
        raw_time = response.xpath(
            '//section//span[@class="time"]/text()').extract_first()
        if raw_time is None:
            return None
        news_dt = self._date_part(raw_time) + ' ' + Get_Time()

        names = response.xpath(
            '//div[@class="article-info art-share-layout m-share-layout clearfix"]/a'
        ).xpath('string(.)').extract()
        author = '电子发烧友网'
        if names and names[0] != '':
            author = names[0]
        if names == ['']:
            # A single empty author link: ask the member API for the
            # writer name, keeping the default author if that fails.
            uid = response.xpath(
                '//input[@id="webMID"]/@value').extract_first()
            if uid is not None:
                url = 'http://www.elecfans.com/webapi/member/getUserInfoNew/uid/{}'.format(
                    str(uid))
                # A timeout keeps a stalled server from blocking the
                # callback.
                payload = requests.get(url, timeout=30).text
                try:
                    name = json.loads(payload)['data']['writer_uname']
                except (ValueError, KeyError, TypeError):
                    # Not JSON, or not the expected structure.
                    name = None
                # Anything but a string would break the HTML assembly.
                if isinstance(name, str):
                    author = name

        tags = response.xpath(
            '//ul[@class="hot-main clearfix"]/li/text()').extract()
        # Strip line breaks and blanks, collapse doubled commas once and
        # trim commas at both ends; an empty list simply gives ''.
        tag = ','.join(tags).replace('\n', '').replace(
            '\r', '').replace(' ', '').replace(',,', ',').strip(',')
        return self._complete_item(response, title, news_dt, author, tag,
                                   body_html)
Ejemplo n.º 12
0
    def parse_item2(self, response):
        """Build an ``EleIndustryItem`` from an article page and yield it.

        Nothing is yielded unless the page has a
        ``div.art-con article_body`` body, an ``<h1>`` and a
        ``div.detailwarn`` whose text contains a ``d-d-d`` date.  When the
        rewritten content has no images, the cover image URL (if any) is
        used instead.
        """
        site = 'https://www.eet-china.com'

        body_nodes = response.xpath('//div[@class="art-con article_body"]')
        if not body_nodes:
            return
        body_html = body_nodes.extract_first()

        heading = response.xpath('//h1')
        if not heading:
            return
        item = EleIndustryItem()
        headline = heading.xpath('string(.)').extract_first().strip()

        warn_nodes = response.xpath('//div[@class="detailwarn"]')
        if not warn_nodes:
            return
        warn_text = warn_nodes.xpath('string(.)').extract_first()
        dates = re.findall(r'\d+-\d+-\d+', warn_text)
        writers = re.findall(r'作者:(.*)', warn_text)
        if not dates:
            return

        published = dates[0] + ' ' + Get_Time()
        writer = writers[0].strip() if writers else ''
        # May be None when the lead text is absent; stored unchanged.
        lead = response.xpath(
            '//span[@class="art-lead-text"]/text()'
        ).extract_first()
        tags = ','.join(
            response.xpath(
                '//div[@class="art-relative-tags"]/a/text()'
            ).extract())

        item['News_Title'] = headline
        item['News_Dt'] = published
        item['Author'] = writer
        item['Keywords'] = tags
        item['Abstract'] = lead

        explain = ('<div class="explain"><span>' + writer + '</span><time>'
                   + str(published.split(' ')[0]) + '</time></div>')
        page_html = ('<h1>' + headline + '</h1>' + explain
                     + "<div class='content'>" + body_html + "</div>")
        rewritten = change_content(page_html, site)
        item['Content'] = rewritten[0]

        # The cover image lives in an inline style such as "...url(...)";
        # parentheses are removed first, then everything after "url" is
        # taken as the address.
        cover_url = ''
        style = response.xpath(
            '//div[@class="cover-img"]/@style').extract_first()
        if style is not None:
            match = re.search(r'url(.*)',
                              style.replace('(', '').replace(')', ''))
            if match:
                cover_url = urljoin(site, match.group(1))

        pictures = rewritten[1]
        if not pictures and cover_url:
            pictures.append(cover_url)

        get_pic(item, pictures)
        item['Update_Tm'] = get_time_stamp()
        item['URL'] = response.url
        item['Web_Id'] = '5-38'
        yield item