def process_item(self, html_page):
    """Extract an article from a crawled page and add it to the output corpus.

    The page's HTML is passed to ``examine`` for a publication date and to
    boilerpipe's ``ArticleExtractor`` for the title and body text.  If
    ``self.art_ok`` accepts the body, the text is split into sentences,
    filtered with ``self.check_sent`` and, when at least three sentences
    remain, the article dict is handed to ``self.output_corpus.add_instance``.

    Args:
        html_page: mapping read with the keys ``'html'``, ``'source'``,
            ``'url'`` and ``'timestamp'``.

    Returns:
        None.  Processing is best-effort: any exception raised along the way
        is logged as a warning and suppressed, so a single bad page never
        propagates an error to the caller.
    """
    import logging

    try:
        publish_date = examine(html_page['html'])
        # Kept as a function-level import, as in the original code; a failed
        # import is therefore handled like any other per-page failure.
        from boilerpipe.extract import Extractor
        extractor = Extractor(extractor='ArticleExtractor', html=html_page['html'])
        body = str(extractor.getText())
        title = str(extractor.source.getTitle())
        art = {
            'title': title,
            'body': body,
            'lang': self.lang,
            'source': html_page['source'],
            'url': html_page['url'],
            'crawl_date': html_page['timestamp'],
            'publish_date': publish_date,
            # Hex digest of sha1 over the UTF-8 encoded URL.
            'article_id': sha1(html_page['url'].encode('utf-8')).hexdigest(),
            'sentences': [],
        }
        if not self.art_ok(art['body']):
            return

        # Only the text used for sentence splitting is normalised; the stored
        # 'body' keeps the extractor's raw output.
        content = art['body']
        content = content.replace(u'\xa0', u' ')  # non-breaking space -> space
        content = content.replace('\\n', '\n')  # literal backslash-n -> newline

        if self.lang == 'en':
            sents = sent_tokenize(content)
        else:
            # Other languages are split one paragraph (line) at a time.
            sents = []
            for para in content.split('\n'):
                sents.extend(sentence_split(para, self.lang))

        sents = [sent for sent in sents if self.check_sent(sent)]
        art['sentences'] = sents
        # Articles with fewer than three accepted sentences are dropped.
        if len(sents) >= 3:
            self.output_corpus.add_instance(art)
    except Exception:
        # Best-effort by design, but record the failure instead of hiding it.
        url = html_page.get('url') if isinstance(html_page, dict) else None
        logging.getLogger(__name__).warning(
            'process_item failed for url %r', url, exc_info=True
        )
def test_download():
    '''Check fetch_url() results and what examine() returns for them.'''
    # Degenerate inputs: a single space and a string of 100 million zeros.
    assert examine(' ', False) is None
    assert examine('0'*int(10e7), False) is None

    # The 404 endpoint is expected to produce no content.
    not_found = fetch_url('https://httpbin.org/status/404')
    assert not_found is None

    # The bare 200 endpoint is also expected to come back as None,
    # and examine() must cope with that None.
    bare_ok = fetch_url('https://httpbin.org/status/200')
    assert bare_ok is None
    assert examine(bare_ok) is None

    # A page of links downloads, but examine() finds no date in it.
    links_page = fetch_url('https://httpbin.org/links/2/2')
    assert links_page is not None
    assert examine(links_page) is None

    # The sample HTML page downloads; examine() in non-extensive mode
    # returns None for it.
    sample_page = fetch_url('https://httpbin.org/html')
    assert sample_page is not None
    assert examine(sample_page, False) is None
def test_download():
    '''Check fetch_url() against httpbin endpoints and feed the results to examine().'''
    # Disabled checks against other hosts, left commented out:
    #assert fetch_url('https://www.iana.org/404') is None
    #assert fetch_url('https://www.google.com/blank.html') is None
    #assert fetch_url('https://blank.org') is None

    # Error status: nothing is returned.
    missing = fetch_url('https://httpbin.org/status/404')
    assert missing is None

    # Plain 200 endpoint: the test expects None here too, and examine(None) is None.
    status_ok = fetch_url('https://httpbin.org/status/200')
    assert status_ok is None
    assert examine(status_ok) is None

    # Links page: content is returned, examine() yields None for it.
    links_html = fetch_url('https://httpbin.org/links/2/2')
    assert links_html is not None
    assert examine(links_html) is None

    # Sample HTML page: content is returned, non-extensive examine() yields None.
    demo_html = fetch_url('https://httpbin.org/html')
    assert demo_html is not None
    assert examine(demo_html, False) is None
def test_cli():
    '''Exercise examine() in extensive mode, with and without a maxdate bound.'''
    # A single space and a string of 100 million zeros both give None.
    blank_doc = ' '
    assert examine(blank_doc, extensive_bool=True) is None
    huge_doc = '0'*int(10e7)
    assert examine(huge_doc, extensive_bool=True) is None

    # German-language date inside an "entry-date" span.
    span_doc = '<html><body><span class="entry-date">12. Juli 2016</span></body></html>'
    assert examine(span_doc, True) == '2016-07-12'

    # ISO date directly in the body text.
    iso_doc = '<html><body>2016-07-12</body></html>'
    assert examine(iso_doc, extensive_bool=True) == '2016-07-12'

    # maxdate earlier than the document date: the test expects None.
    too_early = examine(iso_doc, extensive_bool=True, maxdate='2015-01-01')
    assert too_early is None

    # maxdate later than the document date: the date is returned.
    within_bound = examine(iso_doc, extensive_bool=True, maxdate='2017-12-31')
    assert within_bound == '2016-07-12'

    # A maxdate that is not a real calendar date: the date is still returned.
    bogus_bound = examine(iso_doc, extensive_bool=True, maxdate='2017-41-41')
    assert bogus_bound == '2016-07-12'
def test_cli():
    '''Exercise examine() with its second argument set to True.'''
    # A single space and a string of 100 million zeros both give None.
    blank_doc = ' '
    assert examine(blank_doc, True) is None
    huge_doc = '0'*int(10e7)
    assert examine(huge_doc, True) is None

    expected_date = '2016-07-12'

    # German-language date inside an "entry-date" span.
    span_doc = '<html><body><span class="entry-date">12. Juli 2016</span></body></html>'
    assert examine(span_doc, True) == expected_date

    # ISO date directly in the body text.
    iso_doc = '<html><body>2016-07-12</body></html>'
    assert examine(iso_doc, True) == expected_date