Example #1
def get_article(article_link, summary_length=5):
    '''
    Extract an article and summarize it
    '''
    article = Article(article_link)
    # build() downloads, parses and runs NLP, populating article.summary
    article.build()
    return article
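
A minimal usage sketch, assuming Article is newspaper3k's newspaper.Article, as the other examples on this page suggest:

from newspaper import Article

article = get_article('http://www.bbc.com/news/world-europe-35828810')
# build() has already run nlp(), so the summary is available
print(article.summary)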
Example #2
 def test_chinese_fulltext_extract(self):
     url = 'http://www.bbc.co.uk/zhongwen/simp/chinese_news/2012/12/121210_hongkong_politics.shtml'
     article = Article(url=url, language='zh')
     article.build()
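     # note: no mock_response_with() call here, so this test fetches the live URL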
     # assert isinstance(article.stopwords_class, StopWordsChinese)
     with codecs.open(os.path.join(TEXT_FN, 'chinese_text_1.txt'), 'r', 'utf8') as f:
         assert article.text == f.read()
Example #3
class NewsSearch:
    def __init__(self, searchterm):
        self.searchterm = searchterm
        self.texts = []
        self.links = []
        self.date = []
        self.art = []
        self.GoogleSearch()
    
    def GoogleSearch(self):
        # URL-encode the search term (quote_plus from urllib.parse, an assumed
        # import) so spaces and special characters are safe in the query string
        query = quote_plus(self.searchterm)
        self.url = "https://news.google.com/rss/search?q=" + query + "&hl=en-US&gl=US&ceid=US%3Aen"
        client = urlopen(self.url)
        xml_page = client.read()
        client.close()
        soup_page = soup(xml_page, "xml")
        news_list = soup_page.findAll("item")
        # collect each item's title, link and publish date
        for news in news_list:
            self.texts.append(news.title.text)
            self.links.append(news.link.text)
            self.date.append(news.pubDate.text)

        for link in self.links:
            self.article = Article(link)
            try:
                self.article.build()
                self.article.nlp()
            except Exception:
                # keep going even if one article fails to download or parse
                pass
            finally:
                self.art.append(self.article)
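
A short usage sketch for the class above, assuming the imports it relies on (urlopen from urllib.request, BeautifulSoup aliased as soup, quote_plus from urllib.parse, and newspaper's Article):

search = NewsSearch("climate change")
for article in search.art:
    print(article.title)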
                
Example #4
 def test_chinese_fulltext_extract(self):
     url = 'http://news.sohu.com/20050601/n225789219.shtml'
     mock_response_with(url, 'chinese_article')
     article = Article(url=url, language='zh')
     article.build()
     with codecs.open(os.path.join(TEXT_FN, 'chinese.txt'),
                      'r', 'utf8') as f:
         assert article.text == f.read()
Example #5
def get_article_from_url(url):
    new_article = NewsItem(url=url)
    new_article.download()
    new_article.build()
    new_article.nlp()
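    # note: in newspaper3k, build() itself runs download(), parse() and nlp(),
    # so the explicit download()/nlp() calls above may be redundant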
    #print(new_article.__dict__)
    #print(new_article)
    return new_article
Example #6
    def test_arabic_fulltext_extract(self):
        url = 'http://arabic.cnn.com/2013/middle_east/8/3/syria.clashes/index.html'

        article = Article(url=url)
        article.build()
        assert article.meta_lang == 'ar'
        # assert isinstance(article.stopwords_class, StopWordsArabic)
        with codecs.open(os.path.join(TEXT_FN, 'arabic_text_1.txt'), 'r', 'utf8') as f:
            assert article.text == f.read()
Example #7
def test():
    url = 'http://www.bbc.com/news/world-europe-35828810'
    #url = 'http://www.bbc.com/hindi/sport/2016/02/160227_heart_change_for_kohli_fan_dil'
    try:
        a = Article(url)
        a.build()
        process_and_save_article(a, 'bbc')
    except Exception as e:
        print("error detected:", e)
Example #8
 def test_spanish_fulltext_extract(self):
     url = 'http://ultimahora.es/mallorca/noticia/noticias/local/fiscal'\
           'ia-anticorrupcion-estudia-recurre-imputacion-infanta.html'
     mock_response_with(url, 'spanish_article')
     article = Article(url=url, language='es')
     article.build()
     with codecs.open(os.path.join(TEXT_FN, 'spanish.txt'), 'r',
                      'utf8') as f:
         assert article.text == f.read()
Example #9
 def test_arabic_fulltext_extract(self):
     url = 'http://arabic.cnn.com/2013/middle_east/8/3/syria.clashes/'\
           'index.html'
     mock_response_with(url, 'arabic_article')
     article = Article(url=url)
     article.build()
     assert article.meta_lang == 'ar'
     with codecs.open(os.path.join(TEXT_FN, 'arabic.txt'),
                      'r', 'utf8') as f:
         assert article.text == f.read()
Example #10
def _retrieve_content(url):
    article = Article(url)
    success = False
    try:
        article.build()
        success = True
    except ArticleException as e:
        # write the error message, not the exception object, to stderr
        sys.stderr.write(str(e))
    return article, success
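
A brief usage sketch for the helper above:

article, ok = _retrieve_content('http://www.bbc.com/news/world-europe-35828810')
if ok:
    print(article.title)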
Example #11
 def article_list(self):
     for link in self.links:
         art = Article(link)
         try:
             art.build()
             art.nlp()
          except Exception:
              # keep the article object even if building it failed
              pass
         finally:
             self.art_list.append(art)
Example #12
def test():
    #url = 'http://money.cnn.com/2016/02/26/investing/warren-buffett-berkshire-hathaway-annual-shareholder-letter/index.html?section=money_topstories'
    #url = 'http://www.bbc.com/hindi/sport/2016/02/160227_heart_change_for_kohli_fan_dil'
    url = 'http://www.bbc.com/news/world-europe-35828810'
    #url = 'http://fox13now.com/2013/12/30/new-year-new-laws-obamacare-pot-guns-and-drones/'
    a = Article(url)
    a.build()

    loc = get_news_location(a, num_of_location=3)
    print(loc)

    try:
        print(detect(a.text))
    except lang_detect_exception.LangDetectException:
        print("Not English")
Example #13
def test_save_article_function():
    from newspaper import Article
    today = datetime.datetime.fromtimestamp(time.time())
    url = 'http://www.bbc.com/news/world-europe-35828810'
    #url = 'http://fox13now.com/2013/12/30/new-year-new-laws-obamacare-pot-guns-and-drones/'
    a = Article(url)
    a.build()
    #print (a.title, a.publish_date)

    #if the news has no publish_date, set it to today
    if a.publish_date is None:
        a.publish_date = today

    path_to_save = get_path_to_save(a)
    data_a = get_serialized_article_obj(a)
    create_file(path_to_save, data=data_a)
Example #14
def get_actual_url(x):
    global count
    count += 1
    print(count)
    try:
        a = Article(x)
        a.build()
    except Exception:
        return x
    else:
        # prefer the canonical url from the page's meta tags when available
        if a.meta_data['url']:
            return a.meta_data['url']
        else:
            try:
                return a.meta_data['og']['url']
            except KeyError:
                return x
Example #15
 def build_articles(self, links):
     '''Create the document, skipping links that previously failed.'''
     with open(self.bad_links, 'a+') as bad_links_file:
         # read the already-known bad links once, instead of testing
         # membership against a file handle positioned at EOF
         bad_links_file.seek(0)
         known_bad = set(line.strip() for line in bad_links_file)
         with open(self.name, "w") as document:
             # iterate over a copy so removing failed links is safe
             for link in list(links):
                 if link in known_bad:
                     continue
                 try:
                     article = Article(link)
                     article.build()
                     self.content(article, document)
                     self.success += 1
                 except Exception:
                     self.error += 1
                     links.remove(link)
                     bad_links_file.write(link + '\n')
     print(self.success)
     print(self.error)
     return links
Example #16
def getArticle(url, company, sec_code):
    try:
        print(company, sec_code)
        article = Article(url)
        article.download()
        article.build()
        article.parse()
        article.nlp()
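        # note: in newspaper3k, build() already performs download(), parse() and
        # nlp(), so the explicit calls around it here are redundant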
        ans = {}
        hsh = hashlib.md5(article.title.encode())
        hsh = hsh.hexdigest()
        ans['_id'] = str(hsh)
        ans['title'] = str(article.title)
        ans['summary'] = str(article.summary).replace('\n', '')
        if article.publish_date is None:
            ans['publish_date'] = str(datetime.now().date())
            ans['publish_time'] = str(datetime.now().time())
        else:
            ans['publish_date'] = str(article.publish_date.date())
            ans['publish_time'] = str(article.publish_date.time())
        ans['authors'] = article.authors
        ans['source'] = str(article.source_url)
        ans['company'] = company
        ans['Security'] = sec_code
        ans['category'] = 'news'
        ans['keywords'] = article.keywords
        sd = []
        st = []
        try:
            matches = datefinder.find_dates(article.summary)
            for match in set(matches):
                sd.append(str(match.date()))
                st.append(str(match.time()))
        except Exception:
            pass
        ans['story_dates'] = sd
        ans['story_time'] = st

        news.append(ans)
        insert_into_db(ans)
        print("Success - " + url)
    except Exception as e:
        print("Failed - " + url + " : " + str(e))
Example #17
    def newspaper_parser(self, sleep_time=0):
        print('running newspaper_parser()...')

        results = []
        count = 0
        #print(self.links)
        for l in self.links:

            article = Article(url=l)
            try:
                article.build()
                print(article.summary)
            except Exception:
                # back off for a minute, then skip this link
                time.sleep(60)
                continue

            data = {
                'title': article.title,
                'date_published': article.publish_date,
                'news_outlet': self.newspaper,
                'authors': article.authors,
                'feature_img': article.top_image,
                'article_link': article.canonical_link,
                'keywords': article.keywords,
                'movies': article.movies,
                'summary': article.summary,
                'text': article.text,
                'html': article.html
            }

            print(data['title'])
            #print(data['publish_date'])
            #print (data['text'])
            #print("")
            print("")
            results.append(data)

            count += 1
            #print (count)
            time.sleep(sleep_time)

        return results
Example #18
def test():
    #url = 'http://money.cnn.com/2016/02/26/investing/warren-buffett-berkshire-hathaway-annual-shareholder-letter/index.html?section=money_topstories'
    #url = 'http://www.bbc.com/hindi/sport/2016/02/160227_heart_change_for_kohli_fan_dil'
    #url = 'http://www.bbc.com/news/world-europe-35828810'
    #url = 'http://fox13now.com/2013/12/30/new-year-new-laws-obamacare-pot-guns-and-drones/'
    url = 'http://www.nytimes.com/2016/03/19/world/europe/dubai-airliner-crashes-while-trying-to-land-at-russian-airport.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=first-column-region&region=top-news&WT.nav=top-news&_r=1'
    print ("building:", url)
    a = Article(url)
    a.build()
    process_and_save_article(a)

    print ("first paragraph")
    print (a.text.split('\n')[0])
    print ("Summary:")
    print (a.summary)   
    
    try:
        print (detect(a.text))
    except lang_detect_exception.LangDetectException:
        print ("Not English")
Example #19
    def build_news_article_from_url(source_url, sNLP):
        """Build a news article object from the source url; returns None if the build fails.
        """
        try:
            print('start to scrape from url:', source_url)

            # pre-process the news with the NewsPaper3k and Boilerpipe libraries
            article = Article(source_url, keep_article_html=True)
            article.build()
            article.nlp()
            extractor = Extractor(extractor='DefaultExtractor', html=article.html)
            article.text = extractor.getText()
            article.article_html = extractor.getHTML()

            news_article = NewsArticle(article, sNLP)
            print('successfully scraped url:', source_url)
            return news_article
        except Exception as e:
            print('failed to scrape url:', source_url)
            print('reason:', e)
            return None
Example #20
def aggregate():
    ArticleRec.objects.filter(
        article_published__lte=datetime.datetime.today() -
        datetime.timedelta(days=7)).delete()

    # note: 'shuffle' here must be a helper that returns the shuffled sequence;
    # random.shuffle would shuffle in place and return None
    for f in shuffle(FeedRec.objects.all()):

        u = f.feed_url

        print(u)
        article_list = grab_rss(f)

        x = 0

        for a in article_list:
            x += 1
            print("Checking article: " + str(x))

            article = Article(url=a.url)

            try:
                article.build()
            except (ArticleException, UnicodeDecodeError, ValueError) as e:
                print("Error:", e)
                continue

            a.content = parser.parse(article.text)['text']
            print(len(a.content))
            if len(a.content) < 50:
                print("Error: Too short")
                continue

            a.tag = clf.predict([article.text])[0]

            width, height = get_image_size(article.top_image)

            if width > 100 or height > 100:
                a.img = article.top_image
            add_article(a)
Example #21
def test():
    #url = 'http://money.cnn.com/2016/02/26/investing/warren-buffett-berkshire-hathaway-annual-shareholder-letter/index.html?section=money_topstories'
    #url = 'http://www.bbc.com/hindi/sport/2016/02/160227_heart_change_for_kohli_fan_dil'
    #url = 'http://www.bbc.com/news/world-europe-35828810'
    #url = 'http://fox13now.com/2013/12/30/new-year-new-laws-obamacare-pot-guns-and-drones/'
    urls = []
    #urls.append('')
    urls.append('http://www.bbc.com/news/world-australia-35800175')
    #urls.append('http://edition.cnn.com/2016/03/21/politics/bernie-sanders-wins-democrats-abroad/index.html')
    #urls.append('http://www.huffingtonpost.com/jonathan-greenberg/three-reasons-bernie-sand_b_9538508.html')
    #urls.append('http://ewn.co.za/2016/03/25/Nigeria-targets-300-army-officers-and-firms-in-widening-corruption-probe')
    for url in urls:
        print("building:", url)
        a = Article(url)
        a.build()
        process_and_save_article(a)

    try:
        print(detect(a.text))
    except lang_detect_exception.LangDetectException:
        print("Not English")
Example #22
def link_parser(link):
    parsed_uri = urlparse(link)
    source = '{uri.netloc}'.format(uri=parsed_uri)
    domain = extract(link).domain

    article = Article(link)
    article.build()
    try:
        full_text = article.text
    except Exception:
        full_text = None

    image = article.top_image
    keywords = article.keywords
    summary = article.summary
    title = article.title

    try:
        published_at = extractArticlePublishedDate(link)
    except Exception as e:
        published_at = None
        print(e)
        print("\n\n\n")

    try:
        language = article.meta_lang
    except Exception:
        language = None

    try:
        author = article.authors
    except Exception:
        author = None
    """
    places = get_location.get_place_context(text=description)

    location = {
        "countries": places.countries,
        "country_mentions" : places.country_mentions,
        "cities" : places.cities,
        "city_mentions" : places.city_mentions
    }
    """

    if image != "" and full_text != "" and title != "":
        dic = {
            'url': link,
            'im': image,
            'title': title,
            'domain': domain,
            'full_text': full_text,
            'summary': summary,
            'keywords': keywords,
            'source': source,
            'published_at': published_at,
            'language': language,
            'author': author
        }
        print('done')
        return dic
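
A minimal usage sketch, assuming the surrounding project provides extractArticlePublishedDate and the usual imports (urlparse, tldextract's extract, newspaper's Article):

info = link_parser('http://www.bbc.com/news/world-europe-35828810')
if info:
    print(info['title'], info['published_at'])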
Example #23
class ArticleTestCase(unittest.TestCase):
    def runTest(self):
        self.test_url()
        self.test_download_html()
        self.test_pre_download_parse()
        self.test_parse_html()
        self.test_meta_type_extraction()
        self.test_meta_extraction()
        self.test_pre_parse_nlp()
        self.test_nlp_body()

    def setUp(self):
        """called before the first test case of this unit begins
        """
        self.article = Article(
            url='http://www.cnn.com/2013/11/27/travel/weather-'
            'thanksgiving/index.html?iref=allsearch')

    def tearDown(self):
        """Called after all test cases finish of this unit
        """
        pass

    @print_test
    def test_url(self):
        assert self.article.url == (
            u'http://www.cnn.com/2013/11/27/travel/weather-'
            'thanksgiving/index.html')

    @print_test
    @responses.activate
    def test_download_html(self):
        mock_response_with(self.article.url, 'cnn_article')
        self.article.download()
        assert len(self.article.html) == 75244

    @print_test
    def test_pre_download_parse(self):
        """Before we download an article you should not be parsing!
        """
        article = Article(self.article.url)

        def failfunc():
            article.parse()

        self.assertRaises(ArticleException, failfunc)

    @print_test
    @responses.activate
    def test_parse_html(self):
        TOP_IMG = ('http://i2.cdn.turner.com/cnn/dam/assets/131129200805-'
                   '01-weather-1128-story-top.jpg')
        DOMAIN = 'www.cnn.com'
        SCHEME = 'http'
        AUTHORS = ['Dana Ford', 'Tom Watkins']
        TITLE = 'After storm, forecasters see smooth sailing for Thanksgiving'
        LEN_IMGS = 46
        META_LANG = 'en'

        mock_response_with(self.article.url, 'cnn_article')
        self.article.build()
        with open(os.path.join(TEXT_FN, 'cnn.txt'), 'r') as f:
            assert self.article.text == f.read()
        assert self.article.top_img == TOP_IMG
        assert self.article.authors == AUTHORS
        assert self.article.title == TITLE
        assert len(self.article.imgs) == LEN_IMGS
        assert self.article.meta_lang == META_LANG

    @print_test
    @responses.activate
    def test_meta_type_extraction(self):
        mock_response_with(self.article.url, 'cnn_article')
        self.article.build()

        meta_type = self.article.extractor.get_meta_type(
            self.article.clean_doc)
        assert 'article' == meta_type

    @print_test
    @responses.activate
    def test_meta_extraction(self):
        mock_response_with(self.article.url, 'cnn_article')
        self.article.build()

        meta = self.article.extractor.get_meta_data(self.article.clean_doc)
        META_DATA = defaultdict(dict, {
            'medium': 'news',
            'googlebot': 'noarchive',
            'pubdate': '2013-11-27T08:36:32Z',
            'title': 'After storm, forecasters see smooth sailing for Thanksgiving - CNN.com',
            'og': {
                'site_name': 'CNN',
                'description': 'A strong storm struck much of the eastern United States on Wednesday, complicating holiday plans for many of the 43 million Americans expected to travel.',
                'title': 'After storm, forecasters see smooth sailing for Thanksgiving',
                'url': 'http://www.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html',
                'image': 'http://i2.cdn.turner.com/cnn/dam/assets/131129200805-01-weather-1128-story-top.jpg',
                'type': 'article'
            },
            'section': 'travel',
            'author': 'Dana Ford and Tom Watkins, CNN',
            'robots': 'index,follow',
            'vr': {
                'canonical': 'http://edition.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html'
            },
            'source': 'CNN',
            'fb': {
                'page_id': 18793419640,
                'app_id': 80401312489
            },
            'keywords': 'winter storm,holiday travel,Thanksgiving storm,Thanksgiving winter storm',
            'article': {
                'publisher': 'https://www.facebook.com/cnninternational'
            },
            'lastmod': '2013-11-28T02:03:23Z',
            'twitter': {
                'site': {'identifier': '@CNNI', 'id': 2097571},
                'card': 'summary',
                'creator': {'identifier': '@cnntravel', 'id': 174377718}
            },
            'viewport': 'width=1024',
            'news_keywords': 'winter storm,holiday travel,Thanksgiving storm,Thanksgiving winter storm'
        })

        assert meta == META_DATA

        # if the value for a meta key is another dict, that dict ought to be
        # filled with keys and values
        dict_values = [v for v in meta.values() if isinstance(v, dict)]
        assert all(len(d) > 0 for d in dict_values)

        # there are exactly 5 top-level keys with dict values (og, vr, fb, ...)
        assert len(dict_values) == 5

        # there are exactly 12 top-level keys with string values (pubdate, ...)
        string_values = [v for v in meta.values() if isinstance(v, str)]
        assert len(string_values) == 12

    @print_test
    @responses.activate
    def test_pre_download_nlp(self):
        """Test running NLP algos before even downloading the article
        """
        mock_response_with(self.article.url, 'cnn_article')

        def failfunc():
            self.article.nlp()

        self.assertRaises(ArticleException, failfunc)

    @print_test
    def test_pre_parse_nlp(self):
        """Test running NLP algos before parsing the article
        """
        article = Article(self.article.url)
        article.download()

        def failfunc():
            article.nlp()

        self.assertRaises(ArticleException, failfunc)

    @print_test
    @responses.activate
    def test_nlp_body(self):
        SUMMARY = """Wish the forecasters were wrong all the time :)"Though the worst of the storm has passed, winds could still pose a problem.\r\nForecasters see mostly smooth sailing into Thanksgiving.\r\nThe forecast has left up in the air the fate of the balloons in Macy's Thanksgiving Day Parade.\r\nThe storm caused some complications and inconveniences, but no major delays or breakdowns.\r\n"That's good news for people like Latasha Abney, who joined the more than 43 million Americans expected by AAA to travel over the Thanksgiving holiday weekend."""

        KEYWORDS = [
            u'great', u'good', u'flight', u'sailing', u'delays', u'smooth',
            u'thanksgiving', u'snow', u'weather', u'york', u'storm', u'winds',
            u'balloons', u'forecasters'
        ]

        mock_response_with(self.article.url, 'cnn_article')
        self.article.build()
        self.article.nlp()
        # print self.article.summary
        # print self.article.keywords
        assert self.article.summary == SUMMARY
        assert self.article.keywords == KEYWORDS
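
Because the test methods above depend on one another, a hedged sketch for running them in order is to execute the chained runTest entry point (the default methodName of a TestCase):

suite = unittest.TestSuite([ArticleTestCase()])
unittest.TextTestRunner(verbosity=2).run(suite)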
Example #24
# enter the url of the page to crawl
# url = 'http://v.media.daum.net/v/20170604205121164'
# url = "https://m.blog.naver.com/heerok93/221076782232"
# url = "https://newspaper.readthedocs.io/en/latest/user_guide/quickstart.html"
url = "http://sports.news.naver.com/wfootball/schedule/index.nhn"
#===========================================
# the language is Korean, so set language='ko'
a = Article(url, language='ko')
a.download()
a.parse()
# get the article title
print(a.title)
# get the article text (150 characters)
# print(a.text)
with open('newspp.html', 'w') as f:
    # text = fulltext(a.html)
    f.write(a.html)

# build() runs download(), parse() and nlp() again; the html is then saved a
# second time so the two files can be compared
a.build()
with open('newspp_1.html', 'w') as f:
    # text = fulltext(a.html)
    f.write(a.html)

print(a.publish_date)
print(a.images)
#===========================================
# nb = newspaper.build(url)
#
# for article in nb.category_urls():
#     print(article)
#===========================================
Example #25
    def newspaper_parser(self, newspaper, links, topic, sleep_time=2):
        print("[System]: newspaper_parser Activated")

        results = []
        count = 0
        # links = ['https://www.ynetnews.com/article/H1zKfsc9L']
        for l in links:
            article = Article(url=l)
            try:
                article.build()
            except Exception as e:
                print("Error 75:", e)
                time.sleep(10)
                continue
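            # note: article.publish_date may be None for some pages, which would
            # make the strftime call below fail; a guard may be needed in practice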
            date = article.publish_date.strftime("%d/%m/%Y")
            if self.dateStart:
                if self.check_dates(date):
                    print("[System]: date is ok")
                else:
                    print("[System]: date is out of range")
                    continue
            if newspaper == "n12":
                authors = self.findN12Authors(l)
            else:
                authors = article.authors
            data = {
                'title': article.title,
                'genre': topic,
                'date_published': date,
                'news_outlet': newspaper,
                'authors': authors,
                'feature_img': article.top_image,
                'link': article.canonical_link,
                'keywords': article.keywords,
                'summary': article.summary,
                'text': article.text
                # 'movies': (article.movies).tolist(),
                # 'html': article.html
            }
            print("title:", data['title'])

            if count < 1:  # print 1 article
                # print("data['title']")
                # print(data['title'])
                print("-----------------------------Article data---------------------------")
                print("date_published:", data['date_published'])
                print("genre:", data['genre'])
                print("authors:", data['authors'])
                print("link:", data['link'])
                print("keywords:", data['keywords'])
                print("summary:", data['summary'])
                print("--------------------------------------------------------")
                print("text:", data['text'])
                print("--------------------------------------------------------")

            if data['text']:
                self.add_article(newspaper, data)

            count += 1
            print(count)
            time.sleep(sleep_time)

        return results