def parse_news_page(self, response):
     """Parse one news article page and yield a ScrapyNewsItem.

     The description falls back to (and is padded with) the first 100
     characters of the article body when the summary selector is empty.
     Pages without a title are skipped.
     """
     # Collapse newlines/tabs and runs of two or more spaces.
     regex = re.compile(r'[\n\r\t]|  +')
     title = response.css('h1.title::attr(title)').get()
     raw_description = response.css(
         'div.description p span strong::text').get()
     # str(None) used to produce the literal 'None' here; handle None
     # explicitly instead.
     description = (regex.sub("", raw_description).strip()
                    if raw_description else '')
     full_text_array = response.css('div.description *::text').getall()
     full_text = regex.sub(
         "", ' '.join(str(d).strip() for d in full_text_array))
     if description == '':
         # No summary on the page: use the start of the body instead.
         description = full_text[:100]
     datatime = response.css(
         'div.date time::attr(datetime)').get()  # '2020-05-29 12:31:00'
     newsUrl = response.request.url
     if len(description) < 100:
         # Pad short summaries with the start of the body text.
         description = description + ' ' + full_text[:100]
     if title is not None:
         item = ScrapyNewsItem()
         # ';' is replaced everywhere — presumably the export format is
         # semicolon-separated; confirm against the pipeline.
         item['title'] = title.strip().replace(';', ' ')
         item['description'] = description.replace(';', ' ')
         item['full_text'] = full_text.replace(';', ' ')
         item['link'] = newsUrl
         item['pub_date'] = datatime
         universityItem = University(id=self.universityId)
         item['university'] = universityItem
         yield item
 def parse_news_page(self, response):
     """Parse one news article page and yield a ScrapyNewsItem.

     The page date ('08.07.2020') is normalised through strptime to a
     'YYYY-MM-DD HH:MM:SS' string.  Pages with an empty body — or a
     missing title, which previously crashed on ``None.strip()`` — are
     skipped.
     """
     title = response.css('div.news_name::text').get()
     raw_description = response.css('div.preview_text ::text').get()
     if raw_description is None or raw_description.strip() == '':
         # Previously str(None) == 'None' skipped this fallback selector
         # entirely; now it also runs when the first selector misses.
         raw_description = response.css('div.preview_text p::text').get()
     description = raw_description.strip() if raw_description else ''
     full_text_array = response.css('div.preview_text *::text').getall()
     regex = re.compile(r'[\n\r\t]|  +')
     full_text = regex.sub(
         "", ' '.join(str(d).strip() for d in full_text_array))
     if description == '':
         description = full_text[:100]
     # Assumes the date element is always present — TODO confirm.
     datatime_ruw = response.css(
         'div.news-detail-date::text').get().strip()  # '08.07.2020'
     datatime = str(datetime.datetime.strptime(datatime_ruw, '%d.%m.%Y'))
     newsUrl = response.request.url
     if len(description) < 100:
         # Pad short summaries with the start of the body text.
         description = description + ' ' + full_text[:100]
     if full_text != '' and title is not None:
         item = ScrapyNewsItem()
         item['title'] = title.strip().replace(';', ' ')
         item['description'] = description.replace(';', ' ')
         item['full_text'] = full_text.replace(';', ' ')
         item['link'] = newsUrl
         item['pub_date'] = datatime
         universityItem = University(id=self.universityId)
         item['university'] = universityItem
         yield item
 def parse_news_page(self, response, item):
     """Fill a partially-populated item with detail-page fields and yield it.

     Args:
         response: article-page response.
         item: ScrapyNewsItem pre-filled with at least 'title'.
     """
     # Collapse newlines/tabs and runs of two or more spaces.
     regex = re.compile(r'[\n\r\t]|  +')
     description_array = response.css(
         'div.article article strong::text').getall()
     description = regex.sub(
         "", ' '.join(str(d) for d in description_array))
     full_text_array = response.css('div.post-content *::text').getall()
     full_text = regex.sub(
         "", ' '.join(str(d).strip() for d in full_text_array))
     # e.g. '2020-06-04T08:39:24-04:00'; assumes the <time> tag is
     # always present — TODO confirm.
     datatime_ruw = response.css(
         'div.news-info-wrapper time::attr(datetime)').get().strip()
     # Keep only 'YYYY-MM-DD HH:MM:SS' (first 19 chars drop the offset).
     datatime = str(datetime.datetime.strptime(
         datatime_ruw, '%Y-%m-%dT%H:%M:%S%z'))[:19]
     newsUrl = response.request.url
     if len(description) < 100:
         # Pad short summaries with the start of the body text.
         description = description + ' ' + full_text[:100]
     item['title'] = item['title'].replace(';', ' ')
     item['description'] = description.replace(';', ' ')
     item['full_text'] = full_text.replace(';', ' ')
     item['link'] = newsUrl
     item['pub_date'] = datatime
     universityItem = University(id=self.universityId)
     item['university'] = universityItem
     yield item
 def parse_news_page(self, response, item):
     """Fill a partially-populated item with detail-page fields and yield it.

     Args:
         response: article-page response.
         item: ScrapyNewsItem pre-filled with at least 'title'.

     The item is dropped when no description, body text or date could be
     extracted.  Previously a missing excerpt leaked the literal string
     'None' into the stored description; a half-missing date crashed
     strptime.
     """
     regex = re.compile(r'[\n\r\t]|  +')
     raw_description = response.css('p.news-excerpt::text').get()
     description = raw_description.strip() if raw_description else ''
     full_text_array = response.css(
         'div.main.parsys div div *::text').getall()
     full_text = regex.sub(
         "", ' '.join(str(d).strip() for d in full_text_array))
     date_day = response.css('span.publication-day::text').get()
     date_year = response.css('span.publication-year::text').get()
     # Sentinel kept when the page carries no parsable date.
     datatime = '2000-01-01 00:00:00'
     if date_day is not None and date_year is not None:
         datatime_ruw = date_day.strip() + ' ' + date_year.strip()  # 'Jul 14 2020'
         datatime = str(datetime.datetime.strptime(datatime_ruw,
                                                   '%b %d %Y'))
     newsUrl = response.request.url
     if len(description) < 100:
         # Pad short summaries with the start of the body text.
         description = description + ' ' + full_text[:100]
     # Skip pages where nothing useful was extracted.
     if not (description.strip() == '' and full_text == ''
             and datatime == '2000-01-01 00:00:00'):
         item['title'] = item['title'].replace(';', ' ')
         item['description'] = description.replace(';', ' ')
         item['full_text'] = full_text.replace(';', ' ')
         item['link'] = newsUrl
         item['pub_date'] = datatime
         universityItem = University(id=self.universityId)
         item['university'] = universityItem
         yield item
 def parse_news_page(self, response):
     """Parse one news article page and yield a ScrapyNewsItem.

     Tries plain <p> text first and falls back to <p><span> text for the
     description.  Pages without a title are now skipped (previously
     this crashed on ``None.strip()``).
     """
     regex = re.compile(r'[\n\r\t]|  +')
     title = response.css(
         'div.article-titles__titles h1.article-titles__title::text').get()
     description_array = response.css(
         'div.article-body.basic-text p::text').getall()
     description = regex.sub(
         "", ' '.join(str(d) for d in description_array)).strip()
     if description == '':
         # Fallback: some articles wrap paragraph text in <span>.
         description_array = response.css(
             'div.article-body.basic-text p span::text').getall()
         description = regex.sub(
             "", ' '.join(str(d) for d in description_array)).strip()
     full_text_array = response.css(
         'div.article-body.basic-text *::text').getall()
     full_text = regex.sub(
         "", ' '.join(str(d).strip() for d in full_text_array))
     # e.g. '2020-06-04T08:39:24-04:00'; assumes the timestamp is
     # always present — TODO confirm.
     datatime_ruw = response.css(
         'p.article-posted-on time.timestamp::attr(datetime)').get().strip()
     # Keep only 'YYYY-MM-DD HH:MM:SS' (first 19 chars drop the offset).
     datatime = str(datetime.datetime.strptime(
         datatime_ruw, '%Y-%m-%dT%H:%M:%S%z'))[:19]
     newsUrl = response.request.url
     if len(description) < 100:
         # Pad short summaries with the start of the body text.
         description = description + ' ' + full_text[:100]
     if title is not None:
         item = ScrapyNewsItem()
         item['title'] = title.strip().replace(';', ' ')
         item['description'] = description.replace(';', ' ')
         item['full_text'] = full_text.replace(';', ' ')
         item['link'] = newsUrl
         item['pub_date'] = datatime
         universityItem = University(id=self.universityId)
         item['university'] = universityItem
         yield item
 def parse_news_page(self, response):
     """Parse one news article page and yield a ScrapyNewsItem.

     The publication date is pulled from either the 'highlights' or the
     'press-releases' article wrapper; '2000-01-01 01:01:01' is kept as
     a sentinel when neither yields a parsable date.  Previously a
     missing title/description leaked the literal string 'None' into
     the item, and a one-element text array raised IndexError.
     """
     regex = re.compile(r'[\n\r\t]|  +')
     title = response.css('h1.page-header span::text').get()
     raw_description = response.css(
         'div.field.field--name-field-caption-format.field--type-text-long.'
         'field--label-hidden.landingimg-caption.field--item  p::text'
     ).get()
     description = regex.sub("", raw_description) if raw_description else ''
     full_text_array = response.css('div.content *::text').getall()
     full_text = regex.sub(
         "", ' '.join(str(d).strip() for d in full_text_array))
     datatime = '2000-01-01 01:01:01'
     array = response.css(
         'article.highlights.full.clearfix *::text').getall()
     array_press_releases = response.css(
         'article.press-releases-1.full.clearfix *::text').getall()
     if len(array) > 1:
         datatime_ruw = str(array[1]).strip()  # '03 July 2020'
         if len(datatime_ruw) > 3:
             # Trim the trailing 3 characters before parsing — TODO
             # confirm what suffix the site appends here.
             datatime_ruw = datatime_ruw[:-3]
             datatime = str(
                 datetime.datetime.strptime(datatime_ruw, '%d %B %Y'))
     if datatime == '2000-01-01 01:01:01' and len(array_press_releases) > 1:
         datatime_ruw = str(array_press_releases[1]).strip()  # '03 July 2020'
         if len(datatime_ruw) > 3:
             datatime_ruw = datatime_ruw[:-3]
             datatime = str(
                 datetime.datetime.strptime(datatime_ruw, '%d %B %Y'))
     newsUrl = response.request.url
     if len(description) < 100:
         # Pad short summaries with the start of the body text.
         description = description + ' ' + full_text[:100]
     # Skip pages where nothing useful was extracted.
     if not (title is None and description == ' ' and full_text == ''):
         item = ScrapyNewsItem()
         item['title'] = (title.strip() if title else '').replace(';', ' ')
         item['description'] = description.replace(';', ' ')
         item['full_text'] = full_text.replace(';', ' ')
         item['link'] = newsUrl
         item['pub_date'] = datatime
         universityItem = University(id=self.universityId)
         item['university'] = universityItem
         yield item
    def parse_news_page(self, response):
        """Parse one news article page and yield a ScrapyNewsItem.

        Description and body text pad each other when either is shorter
        than 100 characters.  '2000-01-01 01:01:01' marks a missing or
        unparsable date.  Pages with no title, description and body are
        skipped.
        """
        title = response.css('h1.cam-sub-title::text').get()
        description_array = response.css(
            'div.field-name-field-content-summary div.field-items div.field-item.even *::text'
        ).getall()
        description = ' '.join(str(d).strip() for d in description_array)
        full_text_array = response.css(
            'div.field-name-body div.field-items div.field-item.even *::text'
        ).getall()
        full_text = ' '.join(str(d).strip() for d in full_text_array)
        if full_text == '':
            # Fallback: restrict to paragraph descendants.
            full_text_array = response.css(
                'div.field-name-body div.field-items div.field-item.even p *::text'
            ).getall()
            full_text = ' '.join(str(d).strip() for d in full_text_array)
        regex = re.compile(r'[\n\r\t]|  +')
        full_text = regex.sub("", full_text)
        newsUrl = response.request.url
        datatime = '2000-01-01 01:01:01'
        datatime_ruw = response.css(
            'div.view-content div.views-row div.view-image-credit span::text'
        ).get()  # '05 June 2020'
        if datatime_ruw is not None and datatime_ruw.strip() != '':
            # NOTE(review): '%d %b %Y' expects an abbreviated month
            # ('Jun'), but the sample above shows 'June' — confirm the
            # real page format.
            datatime = str(datetime.datetime.strptime(datatime_ruw.strip(),
                                                      '%d %b %Y'))
        if len(description) < 100 and full_text != '':
            description = description + ' ' + full_text[:100]
        elif len(full_text) < 100:
            full_text = full_text + ' ' + description[:100]

        # Skip pages where nothing useful was extracted.
        if not (title is None and description.strip() == ''
                and full_text.strip() == ''):
            item = ScrapyNewsItem()
            item['title'] = (title.strip() if title else '').replace(';', ' ')
            item['description'] = description.replace(';', ' ')
            item['full_text'] = full_text.replace(';', ' ')
            item['link'] = newsUrl
            item['pub_date'] = datatime
            universityItem = University(id=self.universityId)
            item['university'] = universityItem
            yield item
 def parse_news_page(self, response):
     """Parse one news article page and yield a ScrapyNewsItem.

     The description is tried against increasingly generic selectors
     until one matches.  Pages where no description was found — or no
     title, which previously crashed on ``None.strip()`` — are skipped.
     """
     regex = re.compile(r'[\n\r\t]|  +')
     title = response.css(
         'div.simple-news-header-block.mb-5.py-5 h1.simple-news-header-block__title.mb-3::text'
     ).get()
     if not title:
         # Alternate page layout ("hero header").
         title = response.css(
             'h1.news-hero-header-block__info__title::text').get()
     description = response.css('div.rich-text p::text').get()
     if not description:
         description = response.css('div.rich-text p span::text').get()
     if not description:
         description = response.css(
             'div.video-block__info__caption.px-4::text').get()
     if not description:
         description_array = response.css(
             'div.rich-text div::text').getall()
         description = ' '.join(str(d) for d in description_array)
     description = regex.sub("", description.strip()).strip()
     # 'July 16, 2020'; assumes the date element is always present —
     # TODO confirm.
     datatime_ruw = response.css(
         'div.publish-date-block__date::text').get().strip()
     datatime = str(datetime.datetime.strptime(datatime_ruw, '%B %d, %Y'))
     newsUrl = response.request.url
     full_text_array = response.css('div.rich-text *::text').getall()
     full_text = regex.sub(
         "", ' '.join(str(d).strip() for d in full_text_array))
     if len(description) < 100:
         # Pad short summaries with the start of the body text.
         description = description + ' ' + full_text[:100]
     if description != "" and title is not None:
         item = ScrapyNewsItem()
         item['title'] = title.strip().replace(';', ' ')
         item['description'] = description.replace(';', ' ')
         item['full_text'] = full_text.replace(';', ' ')
         item['link'] = newsUrl
         item['pub_date'] = datatime
         universityItem = University(id=self.universityId)
         item['university'] = universityItem
         yield item
 def parse_news_page(self, response):
     """Parse one news article page and yield a ScrapyNewsItem.

     The description is always the first 100 characters of the cleaned
     body text; there is no separate summary on this site.
     """
     regex = re.compile(r'[\n\r\t]|  +')
     # Assumes the header is always present — TODO confirm.
     title = response.css('div.page-header h2::text').get().strip()
     full_text_array = response.css(
         'div[itemprop = "articleBody"] *::text').getall()
     full_text = regex.sub(
         "", ' '.join(str(d).strip() for d in full_text_array))
     # e.g. '2020-07-15T16:30:00+03:00'
     datatime_ruw = response.css(
         'dd.published time::attr(datetime)').get().strip()
     # Keep only 'YYYY-MM-DD HH:MM:SS' (first 19 chars drop the offset).
     datatime = str(datetime.datetime.strptime(
         datatime_ruw, '%Y-%m-%dT%H:%M:%S%z'))[:19]
     newsUrl = response.request.url
     description = full_text[:100]
     item = ScrapyNewsItem()
     item['title'] = title.replace(';', ' ')
     item['description'] = description.replace(';', ' ')
     item['full_text'] = full_text.replace(';', ' ')
     item['link'] = newsUrl
     item['pub_date'] = datatime
     universityItem = University(id=self.universityId)
     item['university'] = universityItem
     yield item