    def parse(self, response):
        # print(type(response))

        article = None
        try:
            article = NewsPlease.from_html(response.body.decode("utf-8"))
        except Exception:
            # fall back to latin-1 when the page is not valid UTF-8
            article = NewsPlease.from_html(response.body.decode("latin-1"))
            print("EXCEPTION OCCURRED")

        print(article.date_publish)
        # print(article.text)

        # second pass with newspaper's Article to extract the Spanish body text
        article2 = Article(url="", language="es")
        article2.set_html(response.text)
        article2.parse()

        print(response.url)
        self.db.articles_es.insert_one({
            "title": article.title,
            "pub_date": article.date_publish,
            "url": response.url,
            "content": article2.text,
            "raw_html": response.body
        })

        # follow every extracted link and parse it with this same callback
        links = self.linkExtractor.extract_links(response)
        for link in links:
            yield scrapy.Request(link.url, callback=self.parse)
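Every snippet below relies on NewsPlease.from_html, which parses an HTML string offline and returns an article object exposing fields such as title, maintext and date_publish. A minimal standalone call, with made-up markup, might look like this:

from newsplease import NewsPlease

# Hypothetical sample markup; any HTML string works here.
html = "<html><head><title>Sample</title></head><body><p>Some body text.</p></body></html>"
article = NewsPlease.from_html(html, url=None)
print(article.title, article.date_publish, article.maintext)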
Example #2
    def parse(self, response):
        try:
            article = NewsPlease.from_html(response.body, response.url)
            text = article.maintext
            if any(x in text.lower() for x in self.keywords):
                item = ArticleItem()
                item['title'] = article.title
                item['text'] = text
                item['url'] = response.url
                print('Saved', response.url)
                yield item
        except Exception:
            pass

        # Get all the <a> tags
        a_selectors = response.xpath("//a")
        # print('SELECTORS', a_selectors)
        # Loop on each tag
        for selector in a_selectors:
            text = selector.xpath("text()").extract_first()
            link = selector.xpath("@href").extract_first()
            if link is not None:
                if 'https://' not in link:
                    link = 'https://news.dartmouth.edu%s' % link
                # print(link)
                request = response.follow(link, callback=self.parse)
                # Return it thanks to a generator
                yield request
Example #3
def run_newsplease(htmlstring):
    '''try with newsplease'''
    try:
        article = NewsPlease.from_html(htmlstring, url=None)
        return article.maintext  # sanitize(article.maintext)
    except Exception as err:
        #print('Newsplease exception:', err)
        return ''
Example #4
    def crawl_page(self, response):
        self.crawl_other_links(response)

        article = NewsPlease.from_html(response.text, url=response.url)
        # serialise the article and drop the full text before yielding the metadata
        data = article.get_dict()
        data.pop('maintext')

        yield data
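For reference, get_dict() serialises the article's attributes into a plain dictionary, which is why crawl_page above can simply pop 'maintext' before yielding. A small illustrative check (the markup is made up):

from newsplease import NewsPlease

html = "<html><head><title>Sample</title></head><body><p>Some body text.</p></body></html>"
data = NewsPlease.from_html(html, url=None).get_dict()
data.pop('maintext', None)   # same idea as in crawl_page above
print(sorted(data.keys()))   # title, date_publish, language, ...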
Example #5
def run_newsplease(htmlstring):
    '''try with newsplease'''
    try:
        article = NewsPlease.from_html(htmlstring, url=None)
        if article.date_publish is None:
            return None
        date = convert_date(article.date_publish, '%Y-%m-%d %H:%M:%S', '%Y-%m-%d')
        return date
    except Exception as err:
        print('Exception:', err)
        return None


def main():
    output = {}
    for path in Path('html').glob('*.html.gz'):
        with gzip.open(path, 'rt', encoding='utf8') as f:
            html = f.read()
        item_id = path.stem.split('.')[0]
        article = NewsPlease.from_html(html, url=None)
        output[item_id] = {'articleBody': article.maintext}
    (Path('output') / 'news_please.json').write_text(
        json.dumps(output, sort_keys=True, ensure_ascii=False, indent=4),
        encoding='utf8')
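convert_date is not defined in these snippets; a minimal sketch of a compatible helper, assuming it accepts either a datetime object or a date string in inputformat and re-emits it in outputformat:

from datetime import datetime

def convert_date(value, inputformat, outputformat):
    # Hypothetical helper: news-please usually returns date_publish as a
    # datetime object, but a string in `inputformat` is handled as well.
    if isinstance(value, datetime):
        return value.strftime(outputformat)
    return datetime.strptime(str(value), inputformat).strftime(outputformat)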
Example #7
def get_paragraphs_newsplease(str_text, mode):
  """
  using Newsplease
  """
  try:
    text_det = NewsPlease.from_html(str_text.encode(), url=None).maintext
    if text_det is None:
      list_paragraphs = [""]
    else:
      list_paragraphs = re.split("\n", text_det)
  except Exception:
    list_paragraphs = [""]
  return list_paragraphs
Example #8
def extract(html, url):
    try:
        article = NewsPlease.from_html(html, url=None)
    except newspaper.article.ArticleException as e:
        logger_e.info("{} - {}".format(url, e))
        return {}
    except ValueError as e:
        logger_e.info("{} - {}".format(url, e))
        return {}
    except Exception:
        logger_e.info("{} - {}".format(url, "Unknown error"))
        return {}
    return {
        "title": article.title,
        "maintext": article.maintext,
        "language": article.language,
    }
Example #9
    def parse(self, response):
        now = datetime.datetime.now()

        article = NewsPlease.from_html(response.text, response.url)
        if article.date_publish is not None and article.text is not None:
            yield NewsEntry(
                full_url=response.url,
                source_domain=article.source_domain,
                date_publish=article.date_publish,
                date_download=str(now),
                title=article.title,
                description=article.description,
                text=article.text,
                # dont_filter=True
            )

        for link in LxmlLinkExtractor(allow=self.allowed_domains).extract_links(response):
            yield Request(link.url, self.parse)

    def save_page(self, response):
        # ignore 404s
        if response.status == 404:
            return

        # # make the parent directory
        # url_parts = response.url.split('://')[1].split('/')
        # parent_directory = os.path.join(self.directory, *url_parts)
        # os.makedirs(parent_directory, exist_ok=True)

        # # construct the output filename
        # time = response.meta['wayback_machine_time']
        # if self.unix:
        #     filename = '{0}.snapshot'.format(time.timestamp())
        # else:
        #     filename = '{0}.snapshot'.format(time.strftime(WaybackMachineMiddleware.timestamp_format))
        # full_path = os.path.join(parent_directory, filename)

        # # write out the file
        # with open(full_path, 'wb') as f:
        #     f.write(response.body)

        try:
            # check to make sure I don't already have it
            if bool(db.articles.find_one({'url': response.url})):
                return
            # if I don't, insert
            article = NewsPlease.from_html(response.body, response.url,
                                           datetime.datetime.today()).__dict__
            if article['date_publish'] and article['title']:
                article['download_via'] = 'wayback'
                # insert to db
                db.insert_one(article)
                print('inserted ' + article['url'])
        except Exception:
            traceback.print_exc()
            print(article['url'])