Example #1
def get_details():
    url = request.args.get('url', '')
    if not url:
      abort(400)

    if is_image(url):
      result = {
        "url": url,
        "top_image": url,
        "text": "",
      }
      return jsonify(result)

    article = Article(url)
    article.download()

    try:
      article.parse()
    except (IOError, UnicodeDecodeError):
      return '', 422

    try:
      top_image = article.top_image.rsplit('?',1)[0]
    except AttributeError:
      top_image = ''

    result = {
      "url": url,
      "top_image": top_image,
      "text": article.text,
    }

    return jsonify(result)
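
For context, a minimal client sketch for calling an endpoint like get_details() above; the host, port, and /details route are assumptions, not part of the original example.

import requests

# Hypothetical local deployment of the Flask view above.
resp = requests.get("http://localhost:5000/details",
                    params={"url": "https://example.com/some-article"})
if resp.status_code == 200:
    data = resp.json()
    print(data["top_image"])
    print(data["text"][:200])
else:
    print("Extraction failed with status", resp.status_code)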
Example #2
def get_article():
	tree_urls = ET.parse("DB_urls.xml")
	root_urls = tree_urls.getroot()

	# The problem with English and Chinese can be solved with 
	for field_urls in root_urls.findall("row"):
		url_urls = field_urls.find("field").text
	#	url_urls = 'http://news.sina.com.cn/c/2014-04-21/204729980947.shtml'
	#	url_urls = 'http://china.caixin.com/2013-12-30/100623243.html'

		try:
			response = urllib2.urlopen(url_urls)
			status = response.code

			#print "detected webpage code:", status

			if(status == 404):
				continue
			else:
				a_zh = Article(url_urls, language = 'zh')
				a_zh.download()
				a_zh.parse()
				content_urls = a_zh.text

				if(content_urls == ''):
					a_en = Article(url_urls, language = 'en')
					a_en.download()
					a_en.parse()
					content_urls = content_urls + a_en.text

				if(content_urls != ''):
					compare_article(url_urls, content_urls)			
		except:
			pass
Example #3
 def test_spanish_fulltext_extract(self):
     url = "http://ultimahora.es/mallorca/noticia/noticias/local/fiscalia-anticorrupcion-estudia-recurre-imputacion-infanta.html"
     article = Article(url=url, language="es")
     article.download()
     article.parse()
     with codecs.open(os.path.join(TEXT_FN, "spanish_text_1.txt"), "r", "utf8") as f:
         assert article.text == f.read()
Example #4
def get_image():
  url = request.args.get('url', '')
  if not url:
    abort(400)

  if is_image(url):
    return redirect(url)

  article = Article(url)
  article.download()

  try:
    article.parse()
  except (IOError, UnicodeDecodeError):
    return '', 422

  try:
    top_image = article.top_image.rsplit('?',1)[0]
  except AttributeError:
    top_image = ''

  if not top_image == '':
    return redirect(top_image)
  else:
    return '', 422
Example #5
 def test_pre_parse_nlp(self):
     """Test running NLP algos before parsing the article
     """
     new_article = Article(self.article.url)
     resp = mock_response_with(new_article.url, 'cnn_article')
     new_article.download(resp)
     self.assertRaises(ArticleException, new_article.nlp)
Example #6
 def test_chinese_fulltext_extract(self):
     url = "http://www.bbc.co.uk/zhongwen/simp/chinese_news/2012/12/121210_hongkong_politics.shtml"
     article = Article(url=url, language="zh")
     article.download()
     article.parse()
     with codecs.open(os.path.join(TEXT_FN, "chinese_text_1.txt"), "r", "utf8") as f:
         assert article.text == f.read()
Example #7
 def test_arabic_fulltext_extract(self):
     url = "http://arabic.cnn.com/2013/middle_east/8/3/syria.clashes/index.html"
     article = Article(url=url, language="ar")
     article.download()
     article.parse()
     with codecs.open(os.path.join(TEXT_FN, "arabic_text_1.txt"), "r", "utf8") as f:
         assert article.text == f.read()
Example #8
    def run(self):
        logging.debug("run() - [WAIT]")
        from newspaper import Article

        '''
        Library documentation: http://newspaper.readthedocs.org/en/latest/user_guide/quickstart.htm
        '''

        NOTES_LIST = [
            '118',
            '117',
            # '116',
            # '115',
        ]
        for note_id in NOTES_LIST:
            note = Article(url="http://site.tiagoprnl.in/core/visitor_home/nota/%s/" % note_id)
            note.download()

            print '*' * 100
            # print 'H T M L'
            # print note.html
            #print '*' * 100
            # print 'T E X T'
            note.parse()
            print note.text


        logging.debug("run() - [DONE]")
Example #9
def get_nlp_data(url):
	article = Article(url)
	article.download()
	article.parse()
	article.nlp()
	
	return json.dumps(article.keywords)
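
A usage sketch for get_nlp_data() above (the URL is a placeholder); note that nlp() only works after parse(), which the helper already guarantees by calling them in order.

# Returns json.dumps of the keyword list, e.g. '["keyword1", "keyword2"]'.
keywords_json = get_nlp_data("https://example.com/some-news-story")
print(keywords_json)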
Example #10
 def test_pre_parse_nlp(self):
     """Test running NLP algos before parsing the article
     """
     new_article = Article(self.article.url)
     html = mock_resource_with('cnn_article', 'html')
     new_article.download(html)
     self.assertRaises(ArticleException, new_article.nlp)
Example #11
def main():
    try:
        headlines = requests.get(headline_url)
        
        headlines = json.loads(headlines.text)
        for headline in headlines['Headlines']:
            print("Processing Article %s" % headline['Url'])
            article = Article(headline['Url'])
            article.download()
            article.parse()
            
            
            response = requests.post(calais_url, files={'file': article.text}, headers=headers, timeout=80)
            rdf = json.loads(response.text)
            
            for x in rdf:
                if '_type' in rdf[x] and 'name' in rdf[x]:
                    print("Output for %s %s" % (rdf[x]['_type'], rdf[x]['name']))
                    for instance in rdf[x]['instances']:
                        text = instance['prefix'] + instance['suffix']
                        blob = TextBlob(text)
                        for sentence in blob.sentences:
                            print(sentence)
                            print(sentence.sentiment.polarity)
            print('--------------------')
            
            #print(rdf)
    except Exception as e:
        print ('Error in connect ' , e)
Example #12
 def check_url(args):
     """
     :param (basestr, basestr) url, res_filename:
     :return: (pubdate_failed, fulltext_failed)
     """
     url, res_filename = args
     pubdate_failed, fulltext_failed = False, False
     html = mock_resource_with(res_filename, 'html')
     try:
         a = Article(url)
         a.download(html)
         a.parse()
         if a.publish_date is None:
             pubdate_failed = True
     except Exception:
         print('<< URL: %s parse ERROR >>' % url)
         traceback.print_exc()
         pubdate_failed, fulltext_failed = True, True
     else:
         correct_text = mock_resource_with(res_filename, 'txt')
         if not (a.text == correct_text):
             # print('Diff: ', simplediff.diff(correct_text, a.text))
             # `correct_text` holds the reason of failure if failure
             print('%s -- %s -- %s' %
                   ('Fulltext failed',
                    res_filename, correct_text.strip()))
             fulltext_failed = True
             # TODO: assert statements are commented out for full-text
             # extraction tests because we are constantly tweaking the
             # algorithm and improving
             # assert a.text == correct_text
     return pubdate_failed, fulltext_failed
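
check_url() takes a single (url, res_filename) tuple, which makes it easy to map over many cases at once; a hedged usage sketch with placeholder pairs (real runs need matching mock resources on disk).

# Placeholder (url, resource-name) pairs for illustration only.
pairs = [
    ('http://example.com/story1.html', 'example1'),
    ('http://example.com/story2.html', 'example2'),
]
results = [check_url(pair) for pair in pairs]
for (url, _), (pubdate_failed, fulltext_failed) in zip(pairs, results):
    print(url, 'pubdate_failed:', pubdate_failed, 'fulltext_failed:', fulltext_failed)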
Example #13
def post_new(request):
    if request.method == "POST":
        form = PostForm(request.POST)
        if form.is_valid():
            post = form.save(commit=False)
            post.author = request.user
            post.published_date = timezone.now()
            post.save()
            return redirect('blog.views.post_detail', pk=post.pk)
    elif request.method == 'GET':
        url = request.GET.get('url', '')
               
        if len(url) > 5:
            article = Article(url, language='en')
            article.download()
            article.parse()
            article.nlp()
            image = article.top_image
            summary = article.summary.replace('\n', ' ').replace(u'\u2019',"\'")
            title = article.title.replace(u'\u2019',"\'")
            source = url.split('//')[1].split('/')[0].replace('www.','')
            status = 'UD'
            form = PostForm({'title': title, 'summary': summary, 'image': image, 'link':url, 'source':source, 'status':status,}) 
        else:
            form = PostForm() 

    return render(request, 'blog/post_edit.html', {'form': form})
Example #14
def f(url):
	url_urls = url.text
	try:
		response = urllib2.urlopen(url_urls)
		status = response.code

		#print "detected webpage code:", status

		if(status == 404):
			pass
		else:
			a_zh = Article(url_urls, language = 'zh')
			a_zh.download()
			a_zh.parse()
			# content_urls = a_zh.text

			# if(content_urls == ''):
			# 	a_en = Article(url_urls, language = 'en')
			# 	a_en.download()
			# 	a_en.parse()
			# 	content_urls = content_urls + a_en.text

			# if(content_urls != ''):
			# 	pass
			# 	# compare_article(url_urls, content_urls)			
	except:
		pass
Example #15
def scrapeURLS(inFilPath):
    texts = []
    cache = loadCache()
    toDelURLs = []
    with open(inFilPath) as f:
        urls = f.readlines()
    for url in urls:
        if filter(urlFilters, url):
            toDelURLs.append(url)
            
        if url in cache:
            txt = cache[url]
        else:
            print "Scraping URL %s" % url
            article = Article(url)
            article.download()
            article.parse()
            txt = article.text.replace("\n", " ").replace("  ", " ").strip()
            if txt == "" or filter(txtFilter, txt):
                toDelURLs.append(url)
                continue
            cacheURL(url, txt)
        texts.append(txt)
        deleteURLs(inFilPath, toDelURLs)
    return texts
Example #16
 def test_download_file_failure(self):
     url = "file://" + os.path.join(HTML_FN, "does_not_exist.html")
     article = Article(url=url)
     article.download()
     self.assertEqual(0, len(article.html))
     self.assertEqual(article.download_state, ArticleDownloadState.FAILED_RESPONSE)
     self.assertEqual(article.download_exception_msg, "No such file or directory")
Example #17
def show_article():
    url_to_clean = request.args.get('url_to_clean')
    if not url_to_clean:
        return redirect(url_for('index'))

    article = Article(url_to_clean)
    article.download()
    article.parse()

    try:
      html_string = ElementTree.tostring(article.clean_top_node)
    except:
      html_string = "Error converting html to string."

    try:
      article.nlp()
    except:
      log.error("Couldn't process with NLP")

    a = {
          'html': html_string, 
         'authors': str(', '.join(article.authors)), 
         'title': article.title,
         'text': article.text,
         'top_image': article.top_image,
         'videos': str(', '.join(article.movies)),
         'keywords': str(', '.join(article.keywords)),
         'summary': article.summary
         }
    return render_template('article/index.html', article=a, url=url_to_clean)
    
Example #18
def extract():
  url = sys.argv[1:].pop()

  a = Article(url, keep_article_html=True)
  a.download()
  a.parse()
  a.nlp()

  parsed_uri = urlparse(a.source_url)
  domain = '{uri.netloc}'.format(uri=parsed_uri)

  try:
    publish_date = a.publish_date.strftime('%Y-%m-%d %H:%M')
  except AttributeError:
    publish_date = ""

  try:
    authors = ", ".join(a.authors)
  except AttributeError:
    authors = ""

  result = {}
  result['html'] = a.html
  result['body'] = a.text
  result['title'] = a.title
  result['top_image'] = a.top_image
  result['author'] = authors
  result['html_body'] = a.article_html
  result['favicon'] = a.meta_favicon
  result['description'] = a.summary
  result['publish_date'] = publish_date
  result['keywords'] = a.keywords
  result['sitename'] = re.sub(r"^www.", "", domain)

  return json.dumps(result).encode('utf-8')
Example #19
 def test_download_file_success(self):
     url = "file://" + os.path.join(HTML_FN, "cnn_article.html")
     article = Article(url=url)
     article.download()
     self.assertEqual(article.download_state, ArticleDownloadState.SUCCESS)
     self.assertEqual(article.download_exception_msg, None)
     self.assertEqual(75406, len(article.html))
Example #20
def extract(url=None, keep_html=True):
    """ Attempts to extract article from URL """
    a = Article(url, keep_article_html=keep_html)
    try:
        a.download()
    except Exception, e:
        log.error('Error downloading %s: %s' % (url, str(e)))
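
The snippet above stops right after download(); a minimal sketch (not the original author's code) of how such a helper typically continues with the same newspaper3k API. The returned field names are illustrative, and `log` is assumed to be the module-level logger used in the example.

from newspaper import Article

def extract_sketch(url=None, keep_html=True):
    """Sketch: download, parse, and return a few common Article fields."""
    a = Article(url, keep_article_html=keep_html)
    try:
        a.download()
        a.parse()
    except Exception as e:
        log.error('Error processing %s: %s' % (url, str(e)))  # `log` assumed from the example above
        return None
    return {
        'title': a.title,
        'text': a.text,
        'top_image': a.top_image,
        'article_html': a.article_html if keep_html else None,
    }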
Example #21
def get_news():
    urls = get_urls()
    news = News.query.with_entities(News.source_url).all()

    used_urls = []
    for n in news:
        used_urls.append(n[0])

    for url in urls:
        if not url in used_urls:
            used_urls.append(url)

            article = Article(url, language='pt', keep_article_html=True)
            article.download()
            article.parse()
            article.nlp()

            news_article = News(url)
            news_article.slug = slugify(article.title)
            news_article.title = article.title
            news_article.text = article.text
            news_article.top_image = article.top_image
            news_article.summary = article.summary
            news_article.article_html = article.article_html
            news_article.created_at = datetime.datetime.now()

            exists_this_news = News.query.filter_by(source_url=url).first()

            if not exists_this_news:
                print(url)
                db.session.add(news_article)
                db.session.commit()
Example #22
 def get_article_by_url(url):
     article = Article(url, fetch_images=False)
     article.download()
     if url == "empty":
         return "nolist"
     article.parse()
     return article.text
Example #23
def get_article(url):
    a = Article(url)
    a.download()
    a.parse()

    article = dict()

    article['title'] = a.title
    article['publish_date'] = a.publish_date
    article['authors'] = a.authors
    article['lead_image'] = a.top_image
    article['movies'] = a.movies
    article['text'] = a.text
    article['keywords'] = get_keywords(a.text)


    # This is more likely to fail.
    # try:
    #     article.nlp()
    #     article['summary'] = 'This summary is generated: \n ' + a.summary
    # except Exception:
    #     print Exception
    #     article['summary'] = a.summary

    return article
Example #24
def insert_url(url):
    conn = sqlite3.connect('publico_news_sqllite3.db')
    cursor = conn.cursor()

    # get the article in plain text
    article = Article(url)
    article.download()
    article.parse()
    date = article.publish_date
    title = article.title
    text = article.text

    item = dict()
    item['datetime'] = date
    item['title'] = title
    item['text'] = text
    item['category'] = sys.argv[1].split('/')[6]
    item['link'] = sys.argv[1]
    item['origLink'] = sys.argv[1]

    print(item['category'])
    print(item['datetime'])

    if not duplicate(item, item['category'], cursor):
        status = insert_db(item, item['category'], cursor)
        if status == 1:
            print(sys.argv[1], "inserted")
        else:
            print("Error", status)
    else:
        print(url, "already in BD")

    conn.commit()
    conn.close()
Example #25
def makeDocs():
    utc = pytz.utc
    es = Elasticsearch(BONSAI_URL, verify_certs= True)
    es.indices.delete(index='news', ignore=[400, 404])
    es.indices.create(index='news', ignore=400)

    print "Created"
    cnn_paper = newspaper.build(u'http://cnn.com', memoize_articles=False)
    a = defaultdict(int)
    cnn_articles = cnn_paper.articles
    print cnn_paper.size()
    for i in range(10):
        article = cnn_articles[i]
        url = article.url
        art = Article(url)
        art.download()
        art.parse()
        print art.publish_date
        print art.text
        print "Article" + str(i)
        print art.publish_date is not None
        print art.text is not None
        if (art.publish_date is not None) and (art.text is not None):
            try:
                doc = {
                'domain': 'CNN',
                'date': utc.localize(art.publish_date), 
                'text': art.text
                }
                res = es.index(index="news", doc_type='article', id=i, body=doc)
                print "Doc" + str(i)
            except:
                print "Doc not accepted"
Example #26
 def test2(self):
     articles =[
      'http://www.radionz.co.nz/news/national/281869/seven-arrests-over-police-manhunt',
      'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11491573',
      'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11580358',
      'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11580350',
      'http://www.stuff.co.nz/national/crime/75978990/whanganui-woman-accused-of-leaving-child-in-car-overnight.html',
      'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11574608',
      'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11577923',
      'http://www.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11591401',
      'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11566180'
      ]
     
     articles = [
      'http://www.express.co.uk/news/uk/657926/New-Zealand-John-Key-slams-David-Cameron-Britain-forgetting-history-European-Union-EU',
      'http://www.bbc.co.uk/news/uk-wales-35954982',
      'http://www.telegraph.co.uk/news/2016/04/04/david-cameron-will-be-an-excellent-former-prime-minister/',
      'http://www.pressandjournal.co.uk/fp/news/aberdeenshire/880519/david-camerons-father-named-panamanian-company-aberdeenshire-home/',
      'http://www.theguardian.com/politics/2016/apr/01/senior-tories-brexit-vote-leave-attacks-david-cameron-letter-nhs-staff',
      'http://www.dailymail.co.uk/news/article-3519908/Nuclear-drones-threat-British-cities-Cameron-Obama-hold-war-game-session-respond-attack-kill-thousands-people.html',
      'http://www.telegraph.co.uk/news/2016/03/31/if-david-cameron-cant-stop-the-tory-fighting-hell-clear-jeremy-c/',
      'http://www.manchestereveningnews.co.uk/news/greater-manchester-news/gmp-boost-number-armed-officers-11125178',
      'http://www.theguardian.com/commentisfree/2016/apr/03/cameron-headphones-what-is-cool-what-is-not']
     
     with open("./Output2.txt", "w") as text_file:
         for url in articles:
             print(url)
             a = Article(url)
             a.download()
             a.parse()
             text_file.write(a.text.encode('utf-8'))
             text_file.write('\n')
Example #27
def is_valid_article(link):
    print("Checking valid:\n" + link)

    if "cnn.com" not in link:
        return False
    if "html" not in link:
        return False
    article = Article(link)
    article.download()
    article.parse()
    article.nlp()
    keywords = article.keywords

    matched = False

    for key in keywords:
        if key in nc_set:
            matched = True
    for key in keywords:
        if key in contorversial_set:
            matched = False

    if matched & (len(article.authors) > 0) & (article.publish_date < datetime.datetime(2007, 12, 30, 0, 0)):
        main_file.write(article.title+"\t\t"+", ".join(article.keywords)+"\t\t"+link+"\t\t"+article.text+"\n")
        visited_articles.write(link+"\n")
        return True

    return False
Example #28
    def parse_news(self, response):
        item = ScrapyGooglenewsItem()
        #only log the warning info from request
        logging.getLogger("requests").setLevel(logging.WARNING)

        for href in response.xpath('//h2[@class="title"]/a/@href').extract():
            item['link'] = href
            #use newspaper-0.0.8 to scrape the webpage, then get clean text.
            article = Article(item['link'])
            article.download()
            article.parse()
            item['title'] = article.title
            item['text'] = article.text
            #item['authors'] = article.authors
            #item['date'] = article.publish_date

            if response.url.split('&')[-1] == 'topic=w':
                item['domain'] = 'World'
            if response.url.split('&')[-1] == 'topic=n':
                item['domain'] = 'U.S.'
            if response.url.split('&')[-1] == 'topic=b':
                item['domain'] = 'Business'
            if response.url.split('&')[-1] == 'topic=tc':
                item['domain'] = 'Technology'
            if response.url.split('&')[-1] == 'topic=e':
                item['domain'] = 'Entertainment'
            if response.url.split('&')[-1] ==  'topic=s':
                item['domain'] = 'Sports'
            if response.url.split('&')[-1] ==  'topic=snc':
                item['domain'] = 'Science'
            if response.url.split('&')[-1] ==  'topic=m':
                item['domain'] = 'Health'

            yield item
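
The chain of if-statements above maps a Google News topic code to a section name; the same mapping can be expressed as one dictionary lookup. A small equivalent sketch (the 'Unknown' default is an addition, since the original simply leaves item['domain'] unset when nothing matches):

TOPIC_DOMAINS = {
    'topic=w': 'World',
    'topic=n': 'U.S.',
    'topic=b': 'Business',
    'topic=tc': 'Technology',
    'topic=e': 'Entertainment',
    'topic=s': 'Sports',
    'topic=snc': 'Science',
    'topic=m': 'Health',
}

def domain_for(url):
    """Map the trailing topic parameter of a Google News URL to a section name."""
    return TOPIC_DOMAINS.get(url.split('&')[-1], 'Unknown')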
Example #29
    def runTest(self):
        # The "correct" fulltext needs to be manually checked
        # we have 50 so far
        FULLTEXT_PREPARED = 50
        domain_counters = {}

        with open(URLS_FILE, 'r') as f:
            urls = [d.strip() for d in f.readlines() if d.strip()]

        for url in urls[:FULLTEXT_PREPARED]:
            domain = get_base_domain(url)
            if domain in domain_counters:
                domain_counters[domain] += 1
            else:
                domain_counters[domain] = 1

            res_filename = domain + str(domain_counters[domain])
            html = mock_resource_with(res_filename, 'html')
            try:
                a = Article(url)
                a.download(html)
                a.parse()
            except Exception:
                print('<< URL: %s parse ERROR >>' % url)
                traceback.print_exc()
                continue

            correct_text = mock_resource_with(res_filename, 'txt')
            condensed_url = url[:30] + ' ...'
            print('%s -- fulltext status: %s' %
                  (condensed_url, a.text == correct_text))
Example #30
def parse_input(text, extractor='newspaper'):
    if isinstance(text, str) or isinstance(text, unicode):
        if text.startswith(('http://', 'https://')):
            # Input is a link - need to extract the text from html
            if extractor.lower() == 'goose':
                from goose import Goose
                urlparse = Goose()
                article = urlparse.extract(url=text)
                return unicode_to_ascii(article.cleaned_text)
            else:
                from newspaper import Article
                article = Article(text)
                article.download()
                article.parse()
                return unicode_to_ascii(article.text)
        elif text.endswith('.txt'):
            # Input is a file - need to read it
            textfile = open(text, 'rb')
            article = textfile.read()
            textfile.close()
            return unicode_to_ascii(article)
        else:
            # Input is a string containing the raw text
            return unicode_to_ascii(text)
    else:
        raise ValueError('Input text must be of type str or unicode.')
Example #31
def main_func(search_term):
	search_term = url_encode(search_term)
	browser = None
	browser = webdriver.Chrome("chromedriver 2")
	scrapeCNN(browser, search_term)
	scrapeBBC(browser, search_term)
	scrapeFOX(browser, search_term)
	export_json()


	# Set the limit for number of articles to download
	LIMIT = 30
	data = {}
	data['newspapers'] = {}


	documents = {
		
		"documents":[]
	}


	count = 1


	# Iterate through each news company
	for company, value in all_data.items():
		if 'rss' in value:
			d = fp.parse(value['rss'])
			print("Downloading articles from ", company)

			newsPaper = {
				"rss": value['rss'],
				"link": value['link'],
				"articles": []
			}

			for entry in d.entries:
				# Check if a publish date is provided; if not, the article is skipped.
				# This is done to keep the data consistent and to keep the script from crashing.
				if hasattr(entry, 'published'):
					if count > LIMIT:
						break
					article = {}
					article['link'] = entry.link
					date = entry.published_parsed
					article['published'] = datetime.fromtimestamp(mktime(date)).isoformat()
					try:
						content = Article(entry.link)
						content.download()
						content.parse()
					except Exception as e:
						# If the download for some reason fails (ex. 404) the script will continue downloading
						# the next article.
						print(e)
						print("continuing...")
						continue
					article['title'] = content.title
					article['text'] = content.text
					newsPaper['articles'].append(article)
					print(count, "articles downloaded from", company, ", url: ", entry.link)
					count = count + 1


		else:
			# This is the fallback method if an RSS-feed link is not provided.
			# It uses the python newspaper library to extract articles
			print("Building site for ", company)


			for link in value['link']:
				content = Article(link)

				newsPaper = {
					"link": link,
					"articles": []
				}
				
				noneTypeCount = 0
				
					
				if count > LIMIT:
					break
				try:
					content.download()
					content.parse()
				except Exception as e:
					print(e)
					print("continuing...")
					continue
				# Again, for consistency, if there is no found publish date the article will be skipped.
				# After 10 downloaded articles from the same newspaper without publish date, the company will be skipped.

				article = {}
				article['title'] = content.title
				article['text'] = content.text
				article['link'] = content.url
				if content.publish_date is not None:
					article['published'] = content.publish_date.isoformat()
				newsPaper['articles'].append(article)

				info = {}

				if len(content.text) < 5100:
					info["id"] = company+str(count)
					info["title"] = content.title
					info['link'] = content.url
					info['source'] = company
					info["language"] = "en"
					info["text"] = content.text

					documents["documents"].append(info)


					print(count, "articles downloaded from", company, " using newspaper, url: ", content.url)
					count = count + 1
					noneTypeCount = 0

					data['newspapers'][company] = newsPaper

	run_sample()
Example #32
def article_title(articleTitle,
                  c):  #Article titles to txt file: '2- TitleList.txt'
    with open('2- TitleList.txt', 'a', encoding='utf-8') as file:
        file.write(c + '\n' + articleTitle + '\n')


urlfile = '1- url.txt'

with open(urlfile) as f:  #Read urls and download news text and title
    line = f.readline()
    c = 0
    error = 1
    while line:
        line = line.rstrip('\n')
        a = Article(line, source_url="https://time.com", config=config)
        a.download()
        try:  #Use try/except for urls with read timeout error
            a.parse()
            text_to_file(a.text.lower())
            article_title(a.title.lower(), str(c))
        except:
            error += 1
            pass

        a = 'None'
        line = f.readline()
        clear_output(wait=True)
        print(c)
        time.sleep(2)
        c += 1
Example #33
def textgetter(url):
    """Scrapes web news and returns the content

    Parameters
    ----------

    url : str
        web address to news report

    Returns 
    -------
    
    answer : dict
        Python dictionary with key/value pairs for:
            text (str) - Full text of article
            url (str) - url to article
            title (str) - extracted title of article
            author (str) - name of extracted author(s)
            base (str) - base url of where article was located
            provider (str) - string of the news provider from url
            published_date (str,isoformat) - extracted date of article
            top_image (str) - extracted url of the top image for article

    """
    global done
    TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'p', 'li']

    # regex for url check
    s = re.compile('(http://|https://)([A-Za-z0-9_\.-]+)')
    u = re.compile("(http://|https://)(www.)?(.*)(\.[A-Za-z0-9]{1,4})$")
    if s.search(url):
        site = u.search(s.search(url).group()).group(3)
    else:
        site = None
    answer = {}
    # check that its an url
    if s.search(url):
        if url in done.keys():
            yield done[url]
            pass
        try:
            # make a request to the url
            r = requests.get(url, verify=False, timeout=1)
        except:
            # if the url does not return data, set to empty values
            done[url] = "Unable to reach website."
            answer['author'] = None
            answer['base'] = s.search(url).group()
            answer['provider'] = site
            answer['published_date'] = None
            answer['text'] = "Unable to reach website."
            answer['title'] = None
            answer['top_image'] = None
            answer['url'] = url
            answer['keywords'] = None
            answer['summary'] = None
            yield answer
        # if url does not return successfully, set to empty values
        if r.status_code != 200:
            done[url] = "Unable to reach website."
            answer['author'] = None
            answer['base'] = s.search(url).group()
            answer['provider'] = site
            answer['published_date'] = None
            answer['text'] = "Unable to reach website."
            answer['title'] = None
            answer['top_image'] = None
            answer['url'] = url
            answer['keywords'] = None
            answer['summary'] = None

        # test if length of url content is greater than 500, if so, fill data
        if len(r.content) > 500:
            # set article url
            article = Article(url)
            # test for python version because of html different parameters
            if int(platform.python_version_tuple()[0]) == 3:
                article.download(input_html=r.content)
            elif int(platform.python_version_tuple()[0]) == 2:
                article.download(html=r.content)
            # parse the url
            article.parse()
            article.nlp()
            # if parse doesn't pull text fill the rest of the data
            if len(article.text) >= 200:
                answer['author'] = ", ".join(article.authors)
                answer['base'] = s.search(url).group()
                answer['provider'] = site
                answer['published_date'] = article.publish_date
                answer['keywords'] = article.keywords
                answer['summary'] = article.summary
                # convert the data to isoformat; exception for naive date
                if isinstance(article.publish_date, datetime.datetime):
                    try:
                        answer[
                            'published_date'] = article.publish_date.astimezone(
                                pytz.utc).isoformat()
                    except:
                        answer[
                            'published_date'] = article.publish_date.isoformat(
                            )

                answer['text'] = article.text
                answer['title'] = article.title
                answer['top_image'] = article.top_image
                answer['url'] = url

            # if previous didn't work, try another library
            else:
                doc = Paper(r.content)
                data = doc.summary()
                title = doc.title()
                soup = BeautifulSoup(data, 'lxml')
                newstext = " ".join([l.text for l in soup.find_all(TAGS)])

                # as we did above, pull text if it's greater than 200 length
                if len(newstext) > 200:
                    answer['author'] = None
                    answer['base'] = s.search(url).group()
                    answer['provider'] = site
                    answer['published_date'] = None
                    answer['text'] = newstext
                    answer['title'] = title
                    answer['top_image'] = None
                    answer['url'] = url
                    answer['keywords'] = None
                    answer['summary'] = None
                # if nothing works above, use beautiful soup
                else:
                    newstext = " ".join([
                        l.text
                        for l in soup.find_all('div', class_='field-item even')
                    ])
                    done[url] = newstext
                    answer['author'] = None
                    answer['base'] = s.search(url).group()
                    answer['provider'] = site
                    answer['published_date'] = None
                    answer['text'] = newstext
                    answer['title'] = title
                    answer['top_image'] = None
                    answer['url'] = url
                    answer['keywords'] = None
                    answer['summary'] = None
        # if nothing works, fill with empty values
        else:
            answer['author'] = None
            answer['base'] = s.search(url).group()
            answer['provider'] = site
            answer['published_date'] = None
            answer['text'] = 'No text returned'
            answer['title'] = None
            answer['top_image'] = None
            answer['url'] = url
            answer['keywords'] = None
            answer['summary'] = None
            yield answer
        yield answer

    # the else clause to catch if invalid url passed in
    else:
        answer['author'] = None
        answer['base'] = s.search(url).group()
        answer['provider'] = site
        answer['published_date'] = None
        answer['text'] = 'This is not a proper url'
        answer['title'] = None
        answer['top_image'] = None
        answer['url'] = url
        answer['keywords'] = None
        answer['summary'] = None
        yield answer
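
Note that textgetter() is a generator rather than a plain function, so callers iterate over it; a minimal usage sketch (the URL is a placeholder, and a fresh URL yields a single dict):

for answer in textgetter("https://example.com/news/some-story"):
    print(answer.get("title"), "|", answer.get("provider"), "|", answer.get("published_date"))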
Example #34
    except:
        break

d = dict()
c = 0
print('---------------------------------')
#from Com import Com
r = list()
with codecs.open("withnewspaper.csv", "w", "utf-8") as file:
    file.write('publishedAt,title,author,url,urlToImage,description,comments' +
               '\r\n')
    for i in a:
        print(i)
        url = i
        article = Article(url)
        article.download()
        article.parse()
        # p=Com(url)
        #print(p.get_comment)
        title = article.title
        authors = article.authors
        publish_date = article.publish_date
        text = article.text
        top_image = article.top_image
        '''print(type(title.encode('UTF-8')))
        print(type(authors))
        print(type(str(publish_date)))
        print(type(text))
        print(type(top_image))'''
        html = (article.html)
        soup = BeautifulSoup(html, 'lxml')
Example #35
# How to use Article
from newspaper import Article

# print(help(Article))
url = r'https://new.qq.com/omn/20180705/20180705A0T4T4.html'
article = Article(url)  # an online news article page

article.download()  # download the article

html = article.html  # page source HTML
# print(html)

article.parse()  # parse the article

authors = article.authors  # article authors
# print(authors)

publish_date = article.publish_date  # publish date
# print(publish_date)

text = article.text  # article text
# print(text)

top_image = article.top_image  # top (first) image
# print(top_image)

movies = article.movies  # video links
# print(movies)

title = article.title  # article title
print(title)
Example #36
def goog_news(sch_word,yyyymm,pages=3,smry_words=50):
    quo_word = quote_plus(sch_word)
    mon = pd.to_datetime(yyyymm,format='%Y%m')
    mon_max = mon+pd.DateOffset(months=1)-pd.DateOffset(days=1)
    mrng = list(map(lambda x: x.strftime('%m/%d/%Y'),[mon,mon_max]))
    
    # rescnt (result count) section
    links = []
    driver = webdriver.Chrome('driver/chromedriver.exe')
    url = f"https://www.google.com/search?q={quo_word}&safe=active&rlz=1C1SQJL_koKR831KR832&biw=763&bih=625&source=lnt&tbs=cdr%3A1%2Ccd_min%3A{mrng[0]}%2Ccd_max%3A{mrng[1]}&tbm=nws"
    driver.get(url)
    html0 = driver.page_source
    try:
        a0 = soup(html0,'lxml')
        rescnt = a0.find('div',id='resultStat').get_text()
    except:
        rescnt = ''
    driver.close()
    # end of rescnt section
        
    for i in np.arange(0,pages*10,10):
        url = f"https://www.google.com/search?q={quo_word}&safe=active&rlz=1C1SQJL_koKR831KR832&biw=763&bih=625&source=lnt&tbs=cdr%3A1%2Ccd_min%3A{mrng[0]}%2Ccd_max%3A{mrng[1]}&tbm=nws&start={i}"  
        driver.get(url)
        html = driver.page_source
        a = soup(html,'lxml')
        b1 = a.find_all('a','l lLrAF')
        b2 = a.find_all('a','RTNUJf')
        links.append([h['href'] for h in b1+b2])
    
    driver.close()
    links = list(itertools.chain(*links))
    
    title,date,wd,text,smry,press = [],[],[],[],[],[]
  
    for h in links:
        press.append(re.split('/',h)[2])
        a = Article(h)
        try:
            a.download()
            a.parse()
        except:
            pass  # keep going; the try/except blocks below fall back to empty values
    
        try:
            title.append(a.title)
        except:
            title.append('')

        try:
            dat = a.publish_date
            date.append(dat.strftime('%Y-%m-%d'))
            wd.append(dat.strftime('%a'))
        except:
            date.append('')
            wd.append('')

        try:
            text.append(a.text)
            smry.append(summarize(a.text,word_count=smry_words))
        except:
            text.append('')
            smry.append('')

    news = pd.DataFrame({'mon':yyyymm,'keyword':sch_word,'rescnt':rescnt,'title':title,'date':date,'wkday':wd,
                         'text':text,'smry':smry,'press':press})
    news = news.loc[news.text!='']
    news = news.drop_duplicates()
    news.reset_index(drop=True,inplace=True)
    
    return news
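
A hedged usage sketch for goog_news() above; the search term and month are placeholders, and running it requires the chromedriver path hard-coded inside the function.

# One month of Google News results for a placeholder query.
news_df = goog_news('some search term', '201907', pages=2, smry_words=40)
print(news_df[['date', 'title', 'press']].head())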
Example #37
def main(Url, pub_time, found_time, Source, Keywords, otherNames, Type):
    Keywords = Keywords.lower()
    article = Article(Url)
    article.download()
    if article.is_downloaded:
        article.parse()
        if article.is_parsed:
            print "parsed"
            article.nlp()
    else:
        print "failed download"
        article = urllib.urlopen(Url).read()
        article.download()
        article.parse()
    articleText = (article.text)
    articleText = articleText.encode('ascii', 'replace').replace(
        u"\u0029", "").replace(u"\u0028", "")

    Keywords = Keywords.split(",")
    classifier = '/usr/local/share/stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz'
    jar = '/usr/local/share/stanford-ner/stanford-ner.jar'
    st = StanfordNERTagger(classifier, jar, encoding='utf-8')
    sentence = word_tokenize(articleText)
    output = []
    realtypefind = []
    keywordtotalcount = {}
    count = {}
    categories = defaultdict(list)
    totalcount = 0

    for key in Keywords:
        keywordtotalcount[key] = 0
        for key2 in key.split():
            count[key2] = 0

    itemposition = 0
    totoltypecount = 0
    taged = st.tag(sentence)
    for item in taged:
        firstItem = item[0].encode('utf-8').strip("\)(?.,:`")
        if firstItem:
            if item[1] not in categories:
                categories[item[1]].append(firstItem)
            else:
                categories[item[1]].append(firstItem)
            if item[1] == Type:
                totoltypecount = totoltypecount + 1
                # Creates a full-name list; it is checked against so that an article about Mike Newton
                # does not count Mike Johnson or Sam Newton as people mentioned in the article.
                if itemposition != (len(taged) - 1):
                    if taged[itemposition + 1][1] == Type:
                        realtypefind.append(" " + (item[0].lower() + " " +
                                                   (taged[itemposition + 1][0]
                                                    ).lower()).encode('utf-8'))
                output.append(item[0])
                if item[0].lower() in count:
                    count[item[0].lower()] = count[item[0].lower()] + 1
        itemposition = itemposition + 1

    # Creates a full-name list; it is checked against so that an article about Mike Newton
    # does not count Mike Johnson or Sam Newton as people mentioned in the article.

    for key in keywordtotalcount:
        for T in range(0, len(key.split())):
            (keywordtotalcount[key]
             ) = (keywordtotalcount[key]) + count[(key.split())[T]]

    frequency = (FreqDist(output)).most_common(5)

    for freq in frequency:
        totalcount = totalcount + freq[1]

    keywords_database = ' '.join(article.keywords)

    article_people = []

    for person in keywordtotalcount:
        if person in realtypefind:
            if person in otherNames and otherNames[person] in realtypefind:
                article_people.append(person)
                totalcountofperson = (keywordtotalcount[person] +
                                      keywordtotalcount[otherNames[person]])
                # print person, "is in the article", (round(((keywordtotalcount[person] + keywordtotalcount[otherNames[person]])/float(totoltypecount)), 4) * 100), "%"
                # Sqlite_py_practice.main(Url, Source, post_date, dateTime, article.title, str(article.authors), str(keywords_database), article.summary, articleText)
            else:
                article_people.append(person)
                totalcountofperson = keywordtotalcount[person]
                # print person, "is in the article", (round((keywordtotalcount[person]/float(totoltypecount)), 4) * 100), "%"
                # Sqlite_py_practice.main(Url, Source, post_date, dateTime, article.title, str(article.authors), str(keywords_database), article.summary, articleText)
        else:
            if person in otherNames and otherNames[person] in realtypefind:
                article_people.append(person)
                totalcountofperson = keywordtotalcount[person]
                # print person, "is in the article", (round((keywordtotalcount[person]/float(totoltypecount)), 4) * 100), "%"
                # Sqlite_py_practice.main(Url, Source, post_date, dateTime, article.title, str(article.authors), str(keywords_database), article.summary, articleText)

    if len(article_people) >= 1:
        print Url
        article_id = mysql_article_entry.main(Url, Source, pub_time,
                                              found_time, article.title,
                                              str(article.authors),
                                              str(keywords_database),
                                              article.summary, articleText)
        mysql_article_person_link.main(
            article_id, article_people, totalcountofperson, (round(
                (totalcountofperson / float(totoltypecount)), 4) * 100),
            totoltypecount)
        mysql_article_based_weights.main(article_id, len(articleText), "yes")
        mysql_social_media_entry.main(article_id, Url)
Example #38
 def get_article(self, url):
     article = Article(url)
     article.download()
     article.parse()
     article.nlp()
     return article
Example #39
def extract_news(code, news_links, crawl_source, cursor):
    '''Extract news articles and run NLP on them
    @param code: listed-company code
    @param news_links: news links to extract
    @param crawl_source
    @param cursor: database cursor
    '''

    in_sql = """ INSERT INTO news_extract_content(url_md5,url,code_name,newspaper_title,newspaper_text,
newspaper_authors,newspaper_summary,newspaper_keywords,boilerpipe_article,
boilerpipe_articlesentences,boilerpipe_keepeverything,boilerpipe_largestcontent,
boilerpipe_numwordsrules,boilerpipe_canola,up_time,add_time,extract_count,crawl_source)
VALUES
(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,now(),now(),1,%s)
on duplicate key update code_name = %s,newspaper_title = %s,newspaper_text = %s,
newspaper_authors = %s,newspaper_summary = %s,newspaper_keywords = %s,
boilerpipe_article = %s,boilerpipe_articlesentences = %s,boilerpipe_keepeverything = %s,
boilerpipe_largestcontent = %s,boilerpipe_numwordsrules = %s,boilerpipe_canola = %s,
up_time = now(),extract_count=extract_count+1,crawl_source = %s """

    for link in news_links:
        # URLs shorter than 30 characters are usually not news links; crude, but simple and reliable
        if link is None or len(link) <= 30:
            continue
        # skip URLs that have already been crawled
        if link in bf:
            continue

        try:
            global NEWS_URL_EXTRACTE
            NEWS_URL_EXTRACTE += 1
            url_md5 = hashlib.md5(link).hexdigest()
            # first, extract with newspaper
            newspaper_title = ''
            newspaper_text = ''
            newspaper_authors = ''
            newspaper_summary = ''
            newspaper_keywords = ''
            article = Article(link)
            article.download()
            html = article.html
            if html is None or len(html) == 0:
                continue
            article.parse()
            if article.text and len(article.text) > 0:
                newspaper_title = article.title
                newspaper_text = article.text
                newspaper_authors = article.authors
                if newspaper_authors and len(newspaper_authors) > 0:
                    newspaper_authors = ','.join(newspaper_authors)
                else:
                    newspaper_authors = ''

                article.nlp()
                newspaper_summary = article.summary
                newspaper_keywords = article.keywords
                if newspaper_keywords and len(newspaper_keywords) > 0:
                    newspaper_keywords = ','.join(newspaper_keywords)
                else:
                    newspaper_keywords = ''

            # then extract with boilerpipe

            extractor = Extractor(extractor='ArticleExtractor', html=html)
            boilerpipe_article = extractor.getText()

            extractor = Extractor(extractor='ArticleSentencesExtractor',
                                  html=html)
            boilerpipe_articlesentences = extractor.getText()

            extractor = Extractor(extractor='KeepEverythingExtractor',
                                  html=html)
            boilerpipe_keepeverything = extractor.getText()

            extractor = Extractor(extractor='LargestContentExtractor',
                                  html=html)
            boilerpipe_largestcontent = extractor.getText()

            extractor = Extractor(extractor='NumWordsRulesExtractor',
                                  html=html)
            boilerpipe_numwordsrules = extractor.getText()

            extractor = Extractor(extractor='CanolaExtractor', html=html)
            boilerpipe_canola = extractor.getText()

            # input parameters
            content = (url_md5,link,code, newspaper_title, newspaper_text, newspaper_authors,newspaper_summary,newspaper_keywords,\
                       boilerpipe_article,boilerpipe_articlesentences,boilerpipe_keepeverything,boilerpipe_largestcontent,\
                       boilerpipe_numwordsrules,boilerpipe_canola,crawl_source,   \
                       code, newspaper_title,newspaper_text, newspaper_authors,\
                       newspaper_summary,newspaper_keywords,boilerpipe_article,boilerpipe_articlesentences,boilerpipe_keepeverything,\
                       boilerpipe_largestcontent,boilerpipe_numwordsrules,boilerpipe_canola,crawl_source)
            cursor.execute(in_sql, content)

        except:
            logger.error("crawl_page failed ,Error:%s" %
                         traceback.format_exc())
Example #40
 def get_article_text(self, url):
     article = Article(url)
     article.download()
     article.parse()
     return article.text
Example #41
    def parse_content(self, response):
        # this function parses a single news article in detail

        ID = 'songtengteng'

        website_name = '商务部贸易救济调查局'

        # site section
        website_block = response.xpath(
            "//div[@class='position']/a[2]/text()").extract_first()

        news_url = response.meta['url']

        # author
        news_author_list = response.xpath('//script')
        if len(news_author_list) != 0:
            news_author = news_author_list.re(
                'v.{2}\ss.{4}e\s=\s\"[\u4e00-\u9fa5]+\"')[0][13:].replace(
                    '"', '')
        else:
            news_author = '商务部贸易救济调查局'

        # publish time, normalized format: YYYY MM DD HH:Mi:SS
        publish_time = response.xpath('//script').re(
            'v.{2}\stm\s=\s\".*\"')[0][9:].replace('"', '')
        year = publish_time[0:4]
        month = publish_time[5:7]
        day = publish_time[8:10]
        juti_time = publish_time[-8:]
        publish_time = year + month + day + ' ' + juti_time

        # tags provided with the article
        news_tags = response.xpath('//script').re(
            'v.{2}\sc.+e\s=\s\"[\u4e00-\u9fa5]+\"')[0][14:].replace('"', '')

        # article title
        news_title = response.xpath('//h3/text()').extract_first()

        # article body text
        a = Article(response.url, language='zh')  # Chinese
        a.download()
        a.parse()
        news_content = a.text

        # collect the article's images and build names for them
        image_urls = []
        image_names = []
        image_urls1 = response.xpath(
            '//p[@class="detailPic"]/img/@src|//div[@class="article_con"]/center/img/@src|//p[@style="text-align: center"]/img/@src'
        ).extract()
        if image_urls1 != []:
            image_urls = image_urls1
            for i in range(len(image_urls)):
                if i < 10 and i >= 0:
                    image_name = news_title + '_000' + str(i)
                    image_names.append(image_name)
                elif i < 100 and i >= 10:
                    image_name = news_title + '_00' + str(i)
                    image_names.append(image_name)
                elif i < 1000 and i >= 100:
                    image_name = news_title + '_0' + str(i)
                    image_names.append(image_name)
                else:
                    image_name = news_title + str(i)
                    image_names.append(image_name)

        yield self.getItem(
            id=ID,
            news_url=news_url,
            website_name=website_name,
            website_block=website_block,
            news_title=news_title,
            publish_time=publish_time,
            news_author=news_author,
            news_tags=news_tags,
            news_content=news_content,
            image_urls=image_urls,
            image_names=image_names,
        )
Example #42
    def parse_artical(self, response):  # parse an individual article
        ID = 'songtengteng'

        # news URL
        news_url = response.meta['url']

        # article title
        news_title = response.xpath('//h1/text()').extract_first()
        # author
        a = response.xpath(
            '//div[@class="info-source"]/span/a/text()').extract_first()
        if a == None:
            news_author = ''
        else:
            news_author = a

        # publish time
        publish_time = response.xpath(
            '//div[@class="info-source"]/span[2]/text()').extract_first()
        if publish_time != None:
            year = publish_time[0:4]
            month = publish_time[5:7]
            day = publish_time[8:10]
            juti_time = publish_time[-5:]
            publish_time = year + month + day + ' ' + juti_time + ':' + '00'
        else:
            publish_time = response.xpath(
                '//*[@id="bd-left"]/div[2]/div[1]/div[1]/div[1]/span[2]/text()'
            ).extract_first()
            if publish_time != None:
                year = publish_time[0:4]
                month = publish_time[5:7]
                day = publish_time[8:10]
                juti_time = publish_time[-5:]
                publish_time = year + month + day + ' ' + juti_time + ':' + '00'

        # body text
        '''Consider using a text-density algorithm to extract the article body more quickly'''
        a = Article(response.meta['url'], language='zh')  # Chinese
        a.download()
        a.parse()
        news_content = a.text

        # tags
        news_tags = ''

        # images
        image_urls1 = response.xpath('//p[@class="pi"]/img/@src').extract()
        image_urls = []
        image_names = []
        if image_urls1 != []:
            for i in range(len(image_urls1)):
                image_url = image_urls1[i]
                image_urls.append(image_url)
                if i >= 0 and i < 10:
                    image_title = news_title + '000' + str(i)
                elif i >= 10 and i < 100:
                    image_title = news_title + '00' + str(i)
                elif i >= 100 and i < 1000:
                    image_title = news_title + '0' + str(i)
                else:
                    image_title = news_title + str(i)
                image_names.append(image_title)

        yield self.getItem(id=ID,
                           news_url=news_url,
                           website_name='搜狐焦点',
                           website_block='市场',
                           news_title=news_title,
                           publish_time=publish_time,
                           news_author=news_author,
                           news_tags=news_tags,
                           news_content=news_content,
                           image_urls=image_urls,
                           image_names=image_names)
Example #43
def get_text_date(url):
    try:
        article = Article(url)
        article.download()
        if "Noticia servida automáticamente por la Agencia EFE" in article.html:
            return None, None
        article.html = re.sub(r"\n+", " ", article.html)
        article.html = re.sub(
            r"<blockquote class=\"twitter-tweet\".+?</blockquote>", "",
            article.html)
        article.html = re.sub(
            r"<blockquote class=\"instagram-media\".+?</blockquote>", "",
            article.html)
        article.html = re.sub(
            r"<blockquote class=\"tiktok-embed\".+?</blockquote>", "",
            article.html)
        article.html = re.sub(r"<blockquote cite=\".+?</blockquote>", "",
                              article.html)
        #article.html = re.sub(r"<h2 class=\"mce\">&middot.+?</p>", "", article.html) # subtitulares de vertele
        article.html = re.sub(r"<figcaption.+?</figcaption>", "", article.html)
        article.html = re.sub(
            r"<p><em>Si alguien te ha reenviado esta carta.+?</em></p>", "",
            article.html)  # Verne's "Matrioska" newsletter
        article.html = re.sub(
            r"<p class=\"\">(<b>)?Información sobre el coronavirus(</b>)?.+?ante la enfermedad</a></p>",
            "", article.html)  # El Pais new coronavirus footer
        article.html = re.sub(
            r"<p class=\"\">(<b>)?Información sobre el coronavirus(</b>)?.+?sobre la pandemia.*?</p>",
            "", article.html)  # El Pais old coronavirus footer
        article.html = re.sub(r"<p class=\"\">.*?Suscríbase aquí.*?</p>", "",
                              article.html)  # El País newsletter
        article.html = re.sub(r"<a[^>]+>Apúntate a .*?</a>", "",
                              article.html)  # 20 minutos newsletter
        article.html = re.sub(r"<p[^>]+>Apúntate a .*?</p>", "",
                              article.html)  # 20 minutos newsletter
        article.html = re.sub(
            r"<span class=\"datos-articulo\".+?</div><p class=\"enviaremailerr captcha\">",
            "", article.html)
        article.html = re.sub(r"<aside class=\"modulo temas\".+?</aside>", "",
                              article.html)
        article.html = re.sub(r"Si quieres seguir recibiendo.+?</p>", "",
                              article.html)
        article.html = re.sub(r"<p class=\"siguenos_opinion\">.+?</p>", "",
                              article.html)
        article.html = re.sub(r"<p><a.+?<em>playlists</em> de EL PAÍS</a></p>",
                              "", article.html)
        article.html = re.sub(r"<section class=\"more_info .+?</section>", "",
                              article.html)
        article.html = re.sub(r"<span class=\"EPS-000.+?eps</span>", "",
                              article.html)
        article.html = re.sub(
            r"<span class=\"f_a | color_black uppercase light.+?</span>", "",
            article.html)
        article.html = re.sub(r"<i>Puedes seguir a .+?[nN]ewsletter.?</i>", "",
                              article.html)  # Materia footer
        article.html = re.sub(r"Puedes seguir a .+?(<i>)? *[nN]ewsletter</a>",
                              "", article.html)  # Materia footer
        article.html = re.sub(
            r"<i>Puedes seguir a .+?(<i>)? *[nN]ewsletter</i></a>", "",
            article.html)  # Materia footer
        article.html = re.sub(
            r"<i>Puedes escribirnos a .+?[Nn]ewsletter</i></a>", "",
            article.html)  # new Materia footer
        article.html = re.sub(r"<p><em><strong>¿Nos ayudas?.+?</p>", "",
                              article.html)  # Kiko Llaneras
        article.html = re.sub(
            r"<p class=\"nota_pie\".+?a nuestra <em>newsletter</em>\.?(</span>)*</p>",
            "", article.html)  # pie de Planeta Futuro
        article.html = re.sub(
            r"<i>Puedes escribirnos a.+?<i>[nN]ewsletter</i></a>", "",
            article.html)  # pie de Materia
        article.html = re.sub(r"<p class="
                              "><i>Puedes escribirnos a.+?</p>", "",
                              article.html)
        article.html = re.sub(
            r"<i>Lee este y otros reportajes.+?con EL PAÍS.</i>", "",
            article.html)  # Buenavida EL PAIS footer
        article.html = re.sub(
            r"<h3 class=\"title-related\">.+?</div>", "",
            article.html)  # related news block in El Confi
        article.html = re.sub(
            r"<button.+?</button>", "",
            article.html)  # share buttons on elpais icon
        article.html = re.sub(r"<p class=\"g-pstyle.+?</p>", "", article.html)
        article.html = re.sub(r"<p class=\"nota_pie\">.+?</p>", "",
                              article.html)
        article.html = re.sub(r"<strong>Apúntate a la .+?</strong>", "",
                              article.html)
        article.html = re.sub(r"<p><strong>O súmate a .+?</strong></p>", "",
                              article.html)
        #article.html = re.sub(r"<h2.*?>¿En qué se basa todo esto\?</h2>.*</div>", "", article.html)
        article.html = re.sub(
            r"<strong>M&aacute;s en tu mejor yo</strong>: <a.*?</a>", "",
            article.html)
        article.html = re.sub(r"<p class=\"article-text\"> +<a.*?</a>", "",
                              article.html)
        article.html = re.sub(
            r"<span>Este sitio web utiliza cookies propias.+?</span>", "",
            article.html)
        article.html = re.sub(r"\[LEER MÁS:.+?\]", "", article.html)
        article.html = re.sub(r"<div id=\"post-ratings-.+?Cargando…</div>", "",
                              article.html)  # rating EFE
        article.html = re.sub(
            r"<div id=\"div_guia\" class=\"guia\" itemprop=\"alternativeHeadline\">.+?</div>",
            "", article.html)  # subtitulo EFE
        article.html = re.sub(
            r"<div class=\"f f__v video_player.+?</div></div></div>", "",
            article.html)
        article.html = article.html.replace("<em class=\"mce\">", "<em>")
        article.html = re.sub("([^ ])<em>", "\g<1> <em>", article.html)
        article.html = article.html.replace("<em> ", "<em>")
        article.html = re.sub("([^ ])<i>", "\g<1> <i>", article.html)
        article.html = article.html.replace("<i> ", "<i>")
        article.html = article.html.replace(" </em>", "</em>")
        #article.html = re.sub("</em>([^ \W])", "</em> \g<1>", article.html)
        article.html = re.sub("</em>([^\s\.,;:])", "</em> \g<1>", article.html)
        article.html = article.html.replace(" </i>", "</i>")
        article.html = re.sub("</i>([^\s\.,;:])", "</i> \g<1>", article.html)
        article.html = article.html.replace("<em>", "'")
        article.html = article.html.replace("</em>", "'")
        article.html = article.html.replace("<i>", "'")
        article.html = article.html.replace("</i>", "'")
        article.parse()
        """
		if article.meta_description:
			article.text = article.meta_description + "\n\n" + article.text
		"""
        return article.text, article.publish_date
    except newspaper.article.ArticleException:
        return None, None
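
The example above reduces to one reusable pattern: download the page, pre-clean article.html with re.sub to strip embeds and site-specific boilerplate, then call parse() and read the extracted text and date. A condensed sketch of that pattern follows; it keeps only two of the generic regexes from above and is not a drop-in replacement for the full, site-specific cleanup:

import re

import newspaper
from newspaper import Article


def clean_and_parse(url):
    """Download, pre-clean the raw HTML, then let newspaper extract the text."""
    article = Article(url)
    try:
        article.download()
        # collapse newlines so the non-greedy patterns below can span lines
        article.html = re.sub(r"\n+", " ", article.html)
        # drop social embeds and figure captions before extraction
        article.html = re.sub(r"<blockquote class=\"twitter-tweet\".+?</blockquote>",
                              "", article.html)
        article.html = re.sub(r"<figcaption.+?</figcaption>", "", article.html)
        article.parse()
        return article.text, article.publish_date
    except newspaper.article.ArticleException:
        return None, None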
Exemple #44
0
class PageReaderBase:
    def __init__(self, url, lang="en"):
        self.url = url
        self.lang = lang
        self.article = None

        self.text_property = "text"
        self.title_property = "title"
        self.authors_property = "authors"
        self.publish_date_property = "publish_date"
        self.html_property = "raw_html"
        self.dom_property = "doc"

    def _read(self):
        if self.article is None:
            if self.lang is None:
                self.article = Article(self.url)
            else:
                self.article = Article(self.url, language=self.lang)
            try:
                self.article.download()
                self.article.parse()
            except:
                logger.info(
                    "failed when loading article content for {}\nError: {}".
                    format(self.url, traceback.format_exc()))
        return self.article

    def _get(self, key):
        article = self._read()
        if article is None:
            return None
        data = article.__getattribute__(key)
        return data

    def main_text(self):
        text = self._get(self.text_property)
        if len(text) == 0:
            logger.info("No content has been fetched for {}".format(self.url))
            return None
        return text

    def title(self):
        return self._get(self.title_property)

    def authors(self):
        authors = self._get(self.authors_property)
        if authors is None:
            authors = []
        site_authors = read_site_authors(self.url,
                                         self._get(self.dom_property))
        authors.extend(site_authors)
        return authors

    def publish_date(self):
        return self._get(self.publish_date_property)

    def html(self):
        return self._get(self.html_property)

    def page_title(self):
        dom_tree = self._get(self.dom_property)
        if dom_tree is not None:
            title = dom_tree.findtext(".//title")
            return title or ""
        return ""
Exemple #45
0
def get_bot_response():
    while True:
        userText = request.args.get('msg')
        msg = str(userText)
        entrada = msg.lower()
        f = csv.writer(open('inputs.csv', 'a', encoding='utf-8'))
        f.writerow([msg])
        response = searchbot.get_response(userText)
        if float(response.confidence) >= 0.7:
            return str(searchbot.get_response(userText))
        elif userText == str('NÃO'):
            return str('Refaça a pergunta, por favor!')
        elif userText == str("SIM"):
            return str("Agradecemos o seu contato")
        elif float(response.confidence) == 0.0:
            entrada = msg
            # print(entrada)
            p1 = 'http://receita.economia.gov.br/@@busca?advanced_search=False&sort_on=&SearchableText='
            p2 = '&portal_type%3Alist=Document&created.query%3Arecord%3Alist%3Adate=1970-01-02&created.range%3Arecord=min'
            html = str(p1 + entrada + p2)
            stop2 = nltk.corpus.stopwords.words('portuguese')
            stop2.append('faço')
            stop2.append('um')
            stop2.append('gostaria')
            stop2.append('fazer')
            stop2.append('saber')
            stop2.append('posso')
            stop2.append('como')
            splitter = re.compile('\\W+')

            lista_palavras = []
            lista = [p for p in splitter.split(entrada) if p != '']
            for p in lista:
                if p not in stop2:
                    if len(p) > 1:
                        lista_palavras.append(p)
            ar = len(lista_palavras)
            ax = str(lista_palavras[0:ar])
            e = str(ax).replace(',', ' ').strip('[]')
            e.strip("'")
            headers = {'User-Agent': 'Mozilla/5.0'}
            page = requests.get(html, headers=headers, verify=False, stream=False, timeout=5)
            soup = BeautifulSoup(page.content, 'lxml')
            cla = soup.find(class_='searchResults')
            links = cla.find_all('a')
            # namess = soup.find_all('a')
            # ra = (lista_palavras)
            # BUILD THE LIST OF LINKS FROM THE RFB SITE
            listr = []
            for link in links:
                texto = str(link.get_text()).lower().replace('ã', 'a').replace('-', ' ').replace('ç', 'c').split()
                # print(len(texto))
                url = str(link.get('href'))
                # print(len(url))
                urls = str(link.get('href')).lower().replace('/', ' ').replace('-', ' ').replace('.', ' ').split()
                # print(len(urls))
                if entrada in texto:
                    listr.append(url)
                for i in range(0, ar):
                    if lista_palavras[i] in texto:
                        listr.append(url)
                    elif lista_palavras[i] in urls:
                        listr.append(url)

            listag = []
            rec = 'site:receita.economia.gov.br intitle:' + msg + " -filetype:pdf -.pdf"
            for urla in search(rec, tld='com.br', lang='pt-br', stop=4, pause=5):
                listag.append(urla)

            g = int(len(listag))
            # print(g)

            listago = []
            for z in range(0, g):
                ur = str(listag[z])
                listago.append(ur)

            # print(listago)
            # print(len(listago))
            qo = int(len(listago))
            # print(listr)
            # print(len(listr))
            listaunida = listago + listr
            conj = list(set(listaunida))
            # print(conj)
            # print(len(conj))
            # print(type(conj))

            # print(p)
            # print(len(p))
            j = len(conj)

            reports2 = []
            # news_pool.set(reports2)#, threads_per_source=2)
            # news_pool.join()
            for r in range(0, j):

                try:
                    ia = str(conj[r])
                    article = Article(ia, language="pt")
                    article.download()
                    article.parse()
                    article.nlp()
                    # append only when download/parse/nlp succeeded, so a failed
                    # URL can never leave `article` undefined or duplicate the
                    # previous summary
                    reports2.append(str(article.summary).replace('\n', ' '))
                except:
                    pass
            # print(len(reports2))

            resposta_finalc = set(reports2)
            print(resposta_finalc)

            if resposta_finalc == set():
                wikipedia.set_lang("pt")
                a = msg
                result = wikipedia.search(a, results=1)
                page = wikipedia.summary(result, sentences=5)
                content = page
                return str(content)
            else:
                resposta_final = (
                    str(resposta_finalc).replace('\n', ' ').replace('[', ' ').replace(']', ' ').replace(',',
                                                                                                        ' ').replace(
                        "'", ' ').replace('{', ' ').replace("}", ' '))

                f = csv.writer(open('chats.csv', 'a', encoding='utf-8'))
                f.writerow([msg + '\n' + resposta_final])
                return str(resposta_final + '\n' + 'Ficou satisfeito com a resposta? SIM ou NÃO?')
Exemple #46
0
def article(title):
    global articlePageList
    global articlePageListRec
    global firstTime

    neededUrl = ''
    neededImgUrl = ''
    indexOfArticleCategory = 0
    flag = 0
    flag2 = 0
    pagesize = 5
    if flag2 == 0:
        for articleList in articlePageList:
            for item in articleList:
                if item['title'] == title:
                    neededUrl = item['url']
                    neededImgUrl = item['urlToImage']
                    flag = 1
                    flag2 = 1
                    break
            if flag == 1:
                break
            indexOfArticleCategory += 1
            print(indexOfArticleCategory)
    #nltk.download('punkt')
    if flag2 == 0:
        indexOfArticleCategory = 0
        for articleList in articlePageListRec:
            for item in articleList:
                if item['title'] == title:
                    neededUrl = item['url']
                    neededImgUrl = item['urlToImage']
                    flag = 1
                    flag2 = 0
                    pagesize = 3
                    break
            if flag == 1:
                articlePageList = articlePageListRec
                break
            indexOfArticleCategory += 1
            print(indexOfArticleCategory)

    url = neededUrl
    article = Article(url)
    article.download()
    try:
        article.parse()
    except:
        neededImgUrl = "notPresent"

    article.nlp()
    summary = article.summary
    movies = article.movies
    publishDate = article.publish_date
    if publishDate != None:
        dateStr = publishDate.strftime('%d, %B %Y')
    else:
        dateStr = '-'

    if movies == []:
        movies = ''

    if neededImgUrl == None:
        neededImgUrl = "notPresent"

    ### Recommendations ###
    listofpreff = []
    articlePageListRec = []
    global zipper
    if firstTime == 1:
        if session:
            uid = session['uid']

            connection = pymysql.connect(host='localhost',
                                         user='******',
                                         password='',
                                         db='allinonenews')
            with connection.cursor(pymysql.cursors.DictCursor) as cur:
                sql = "SELECT * FROM prefferences WHERE id = %s"
                result = cur.execute(sql, (uid))
            connection.commit()

            if result > 0:
                # Get stored hash
                preff = cur.fetchall()
                for i in preff:
                    listofpreff = listofpreff + [i['category']]

                for prefference in listofpreff:
                    url = 'https://newsapi.org/v2/everything?language=en&pageSize=3&page=1&q=' + prefference + '&apiKey=097f0f6fb89b43539cbaa31372c3f92d'
                    r = requests.get(url)
                    articlePageListRec.append(r.json()['articles'])
            cur.close()
    zipper = zip(articlePageListRec, listofpreff)

    return render_template('article.html',
                           summary=summary,
                           title=title,
                           index=indexOfArticleCategory,
                           neededImgUrl=neededImgUrl,
                           movies=movies,
                           date=dateStr,
                           articleUrl=url,
                           jso=articlePageList,
                           zipper=zipper,
                           pagesize=pagesize)
Exemple #47
0
def scrapeAnalyse(url, isGeneral, keywords):
    nltk.download('punkt')
    if (isGeneral):
        all_data = []
        headers = {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
        }

        # ------ Google News ------
        response = requests.get("https://news.google.com/search?q=" + keywords,
                                headers=headers)
        soup = BeautifulSoup(response.content, "html.parser")

        for index, link in enumerate(
                soup.findAll('div', attrs={'class': 'NiLAwe'})):
            if index >= 5:
                break
            children = link.findChildren('a', recursive=False)
            for child in children:
                news_url = child.get('href')
                article = Article("https://www.news.google.com" + news_url[1:])
                article.download()
                article.parse()
                date = None
                if article.publish_date == None:
                    date = time.time()
                else:
                    date = article.publish_date.timestamp()
                el = {
                    "text": article.text,
                    "date": date,
                    "url": "https://www.news.google.com" + news_url[1:]
                }
                all_data.append(el)

        # ------ Yahoo News ------
        # response = requests.get("https://news.search.yahoo.com/search?p=" + keywords, headers=headers)
        # soup = BeautifulSoup(response.content, "html.parser")

        # for index, link in enumerate(soup.findAll('h4', attrs={'class':'fz-16 lh-20'})):
        #     if index >= 0:
        #         break
        #     children = link.findChildren('a', recursive=False)
        #     for child in children:
        #         news_url = re.sub("\/RV=2.*", "", child.get('href'))
        #         article = Article(news_url)
        #         article.download()
        #         article.parse()
        #         el = {"text": article.text, "date": article.publish_date, "url": news_url}
        #         all_data.append(el)

        # ------ Bing News ------
        response = requests.get("https://www.bing.com/news/search?q=" +
                                keywords,
                                headers=headers)
        soup = BeautifulSoup(response.content, "html.parser")

        for index, link in enumerate(
                soup.findAll('div',
                             attrs={'class':
                                    'news-card newsitem cardcommon'})):
            if index >= 5:
                break
            news_url = link.get('url')
            article = Article(news_url)
            article.download()
            article.parse()
            date = None
            if article.publish_date == None:
                date = time.time()
            else:
                date = article.publish_date.timestamp()
            el = {"text": article.text, "date": date, "url": news_url}
            all_data.append(el)

        # all_text = "".join(all_text)
        # all_text = "".join(x for x in all_text if x in printable)
        return all_data

    else:
        article = Article(url)
        article.download()
        article.parse()
        article.nlp()
        keywords = article.keywords
        date = None
        if article.publish_date == None:
            date = time.time()
        else:
            date = article.publish_date.timestamp()
        return (article.text, "+".join(keywords), date)
Exemple #48
0
def __get_article(url):
    article = Article(url)
    article.download()
    article.parse()
    return article
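
The helper above lets parse() raise newspaper's ArticleException when the download fails; a slightly more defensive variant (a sketch, not part of the original) checks download_state first, the same test used in Exemple #54 below:

def _get_article_safe(url):
    """Like __get_article, but returns None instead of raising on a failed download."""
    article = Article(url)
    article.download()
    if article.download_state != 2:  # 2 == ArticleDownloadState.SUCCESS
        return None
    article.parse()
    return article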
Exemple #49
0
 def get_sentiment(self,start='2019-03-29',end=datetime.now().strftime("%Y-%m-%d")):
     """Gets daily sentiment using news sources from Google News for the specified time range"""
     start=datetime.strptime(start,"%Y-%m-%d").strftime("%m/%d/%Y")                                                        # puts start time in GoogleNews format
     end=datetime.strptime(end,"%Y-%m-%d").strftime("%m/%d/%Y")                                                            # puts end time in GoogleNews format
     googlenews=GoogleNews(lang='en',start=start,end=end,encode='utf-8')                                                   # creating object for collecting news
     googlenews.search(tick_map[self.ticker[0]])                                                                           # specifying the company
     
     
     # Getting Google Results
     for i in range(1,50):
         googlenews.getpage(i)                                                                                             # loops through google pages
         result=googlenews.result()                                                                                        # stores results
         df=pd.DataFrame(result)                                                                                           # appends results to DataFrame
     df.drop_duplicates(['link'],keep='first',inplace=True)                                                                # removes duplicate articles via links
     
     
     # Collecting Text From Articles
     L=[]                                                                                                                  # initializing empty list
     for ind in df.index:
         try:                                                                                                              # "try" for forbidden websites
             D={}                                                                                                          # initializing the dictionary
             article = Article(df['link'][ind])                                                                            # extracting information from articles
             article.download()
             article.parse()
             article.nlp()
             D['Date']=df['datetime'][ind]                                                                                 # storing information from articles
             D['Media']=df['media'][ind]
             D['Title']=article.title
             D['Article']=article.text
             D['Summary']=article.summary
             L.append(D)                                                                                                   # appending results to list
         except:
             pass
     news_df=pd.DataFrame(L)                                                                                               # make DataFrame from list
     #Preliminary Cleaning
     news_df1=news_df.dropna(axis=0)                                                                                       # dropping old "date" column
     news_df2=news_df1[news_df1['Media']!=""].set_index('Date').sort_index(ascending=True)                                 # remove articles with no media source
     news_df2=news_df2[news_df2['Article'].values!=""]                                                                     # remove articles with no content
     # Making time format %Y-%m-%d and Additional Cleaning
     new_time_format=list(pd.Series(news_df2.index).apply(lambda DATE :DATE.strftime("%Y-%m-%d")).values)                  # string form of new time format   
     new_time_format=[datetime.strptime(DATE,"%Y-%m-%d") for DATE in new_time_format]                                      # datetime form of new time format
     news_df2.index=new_time_format                                                                                        # apply new time format
     news_df2.drop(columns=['Summary','Title'],inplace=True)                                                               # dropping columns
     news_df2=Generic_Parser_Mod.LM_sentiment(news_df2)                                                                    # DataFrame of sentiment scores
     
     
     # Handling of Duplicated Entries
     duplicate_index=news_df2.index[news_df2.index.duplicated()]                                                           # identify duplicate time entries
     collapsed_dates=list(duplicate_index.unique())                                                                        # collapsing duplicate dates
     news_df3=[news_df2.loc[collapsed_dates[i]].median() for i in range(len(collapsed_dates))]                             # collapsing info in duplicate entries
     news_df3=pd.DataFrame(news_df3)                                                                                       # DataFrame of collapsed info
     news_df3.index=collapsed_dates                                                                                        # new collapsed info
     
     
     #Making new DataFrame without Duplicates
     news=news_df2.loc[[news_df2.index[i] not in duplicate_index for i in range(len(news_df2.index))]].append(news_df3,sort=False)
     
     
     # Post-Cleaning, due to unstable nature of API
     news=news.loc[start:end]                                                                                              # only articles from selected period
     news.sort_index(ascending=True,inplace=True)                                                                          # order by date
     news.to_csv(f"Sentiment_Data/{self.ticker[0]}_scores.csv",index='date')                                         # storing the sentiment data
     return news                                                                                                           # return sentiment scores
Exemple #50
0
 def set_text(self):
     if not self.text and self.url:
         a = Article(self.url)
         a.download()
         a.parse()
         self.text = a.text
Exemple #51
0
class ArticleTestCase(unittest.TestCase):
    def setup_stage(self, stage_name):
        stages = OrderedDict([
            ('initial', lambda: None),
            ('download', lambda: self.article.download(
                mock_resource_with('cnn_article', 'html'))),
            ('parse', lambda: self.article.parse()),
            ('meta', lambda: None),  # Alias for nlp
            ('nlp', lambda: self.article.nlp())
        ])
        assert stage_name in stages
        for name, action in stages.items():
            if name == stage_name:
                break
            action()

    def setUp(self):
        """Called before the first test case of this unit begins
        """
        self.article = Article(
            url='http://www.cnn.com/2013/11/27/travel/weather-'
                'thanksgiving/index.html?iref=allsearch')

    @print_test
    def test_url(self):
        self.assertEqual(
            'http://www.cnn.com/2013/11/27/travel/weather-'
            'thanksgiving/index.html?iref=allsearch',
            self.article.url)

    @print_test
    def test_download_html(self):
        self.setup_stage('download')
        html = mock_resource_with('cnn_article', 'html')
        self.article.download(html)
        self.assertEqual(75406, len(self.article.html))

    @print_test
    def test_meta_refresh_redirect(self):
        # TODO: We actually hit example.com in this unit test ... which is bad
        # Figure out how to mock an actual redirect
        config = Configuration()
        config.follow_meta_refresh = True
        article = Article(
            '', config=config)
        html = mock_resource_with('google_meta_refresh', 'html')
        article.download(input_html=html)
        article.parse()
        self.assertEqual(article.title, 'Example Domain')

    @print_test
    def test_meta_refresh_no_url_redirect(self):
        config = Configuration()
        config.follow_meta_refresh = True
        article = Article(
            '', config=config)
        html = mock_resource_with('ap_meta_refresh', 'html')
        article.download(input_html=html)
        article.parse()
        self.assertEqual(article.title, 'News from The Associated Press')

    @print_test
    def test_pre_download_parse(self):
        """Calling `parse()` before `download()` should yield an error
        """
        article = Article(self.article.url)
        self.assertRaises(ArticleException, article.parse)

    @print_test
    def test_parse_html(self):
        self.setup_stage('parse')

        AUTHORS = ['Chien-Ming Wang', 'Dana A. Ford', 'James S.A. Corey',
                   'Tom Watkins']
        TITLE = 'After storm, forecasters see smooth sailing for Thanksgiving'
        LEN_IMGS = 46
        META_LANG = 'en'

        self.article.parse()
        self.article.nlp()

        text = mock_resource_with('cnn', 'txt')
        self.assertEqual(text, self.article.text)
        self.assertEqual(text, fulltext(self.article.html))

        # NOTE: top_img extraction requires an internet connection
        # unlike the rest of this test file
        TOP_IMG = ('http://i2.cdn.turner.com/cnn/dam/assets/131129200805-'
                   '01-weather-1128-story-top.jpg')
        self.assertEqual(TOP_IMG, self.article.top_img)

        self.assertCountEqual(AUTHORS, self.article.authors)
        self.assertEqual(TITLE, self.article.title)
        self.assertEqual(LEN_IMGS, len(self.article.imgs))
        self.assertEqual(META_LANG, self.article.meta_lang)
        self.assertEqual('2013-11-27 00:00:00', str(self.article.publish_date))

    @print_test
    def test_meta_type_extraction(self):
        self.setup_stage('meta')
        meta_type = self.article.extractor.get_meta_type(
            self.article.clean_doc)
        self.assertEqual('article', meta_type)

    @print_test
    def test_meta_extraction(self):
        self.setup_stage('meta')
        meta = self.article.extractor.get_meta_data(self.article.clean_doc)
        META_DATA = defaultdict(dict, {
            'medium': 'news',
            'googlebot': 'noarchive',
            'pubdate': '2013-11-27T08:36:32Z',
            'title': 'After storm, forecasters see smooth sailing for Thanksgiving - CNN.com',
            'og': {'site_name': 'CNN',
                   'description': 'A strong storm struck much of the eastern United States on Wednesday, complicating holiday plans for many of the 43 million Americans expected to travel.',
                   'title': 'After storm, forecasters see smooth sailing for Thanksgiving',
                   'url': 'http://www.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html',
                   'image': 'http://i2.cdn.turner.com/cnn/dam/assets/131129200805-01-weather-1128-story-top.jpg',
                   'type': 'article'},
            'section': 'travel',
            'author': 'Dana A. Ford, James S.A. Corey, Chien-Ming Wang, and Tom Watkins, CNN',
            'robots': 'index,follow',
            'vr': {
                'canonical': 'http://edition.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html'},
            'source': 'CNN',
            'fb': {'page_id': 18793419640, 'app_id': 80401312489},
            'keywords': 'winter storm,holiday travel,Thanksgiving storm,Thanksgiving winter storm',
            'article': {
                'publisher': 'https://www.facebook.com/cnninternational'},
            'lastmod': '2013-11-28T02:03:23Z',
            'twitter': {'site': {'identifier': '@CNNI', 'id': 2097571},
                        'card': 'summary',
                        'creator': {'identifier': '@cnntravel',
                                    'id': 174377718}},
            'viewport': 'width=1024',
            'news_keywords': 'winter storm,holiday travel,Thanksgiving storm,Thanksgiving winter storm'
        })

        self.assertDictEqual(META_DATA, meta)

        # if the value for a meta key is another dict, that dict ought to be
        # filled with keys and values
        dict_values = [v for v in list(meta.values()) if isinstance(v, dict)]
        self.assertTrue(all([len(d) > 0 for d in dict_values]))

        # there are exactly 5 top-level "og:type" type keys
        is_dict = lambda v: isinstance(v, dict)
        self.assertEqual(5, len([i for i in meta.values() if is_dict(i)]))

        # there are exactly 12 top-level "pubdate" type keys
        is_string = lambda v: isinstance(v, str)
        self.assertEqual(12, len([i for i in meta.values() if is_string(i)]))

    @print_test
    def test_pre_download_nlp(self):
        """Test running NLP algos before even downloading the article
        """
        self.setup_stage('initial')
        new_article = Article(self.article.url)
        self.assertRaises(ArticleException, new_article.nlp)

    @print_test
    def test_pre_parse_nlp(self):
        """Test running NLP algos before parsing the article
        """
        self.setup_stage('parse')
        self.assertRaises(ArticleException, self.article.nlp)

    @print_test
    def test_nlp_body(self):
        self.setup_stage('nlp')
        self.article.nlp()
        KEYWORDS = ['balloons', 'delays', 'flight', 'forecasters',
                    'good', 'sailing', 'smooth', 'storm', 'thanksgiving',
                    'travel', 'weather', 'winds', 'york']
        SUMMARY = mock_resource_with('cnn_summary', 'txt')
        self.assertEqual(SUMMARY, self.article.summary)
        self.assertCountEqual(KEYWORDS, self.article.keywords)
Exemple #52
0
def get_Table_html(dataframe,
                   recent_articles=None,
                   titles_show=None,
                   max_rows=10,
                   styling=None):
    """
    building html table with appropriate formatting and styling for both authors and content suggestions
    """
    rows = []
    for i in range(min(len(dataframe), max_rows)):
        row = []
        for col in dataframe.columns:
            if (col == 'Author_wn') or (col == 'Similarity (0-10)'):
                continue
            value = dataframe.iloc[i][col]
            # update this depending on which
            # columns you want to show links for
            # and what you want those links to be
            if col == 'Suggested articles':
                try:
                    if titles_show != None:
                        title_curr = titles_show[value]
                        if not isinstance(title_curr, str):
                            article = Article(value)
                            article.download()
                            article.parse()
                            title_curr = article.title
                    else:
                        article = Article(value)
                        article.download()
                        article.parse()
                        title_curr = article.title
                    cell = html.Td(
                        html.A(
                            href=value,
                            children=title_curr,
                            target='TargetArticle',
                        ))
                    # style={'color':'white', 'textDecoration': 'underline'}))
                except:
                    cell = html.Td(children=value)
                    print(value)
            elif col == 'Authors':
                try:
                    path_link = recent_articles[dataframe.iloc[i]
                                                ['Author_wn']]['links'][0]
                    # path_link = 'https://muckrack.com/' + dataframe.iloc[i]['Author_wn']
                    cell = html.Td(
                        html.A(
                            href=path_link,
                            children=value,
                            target='TargetArticle',
                        ))
                    # style={'color':'white', 'textDecoration': 'underline'}))
                except:
                    cell = html.Td(children=value)
                    print(value)
            else:
                cell = html.Td(children=value)
            row.append(cell)
        rows.append(html.Tr(row))
    return html.Table(
        # Header
        # [html.Tr([html.Th(col) for col in dataframe.columns if (col == 'Author_wn') or (col == 'Similarity (0-10)')])] +
        rows,
        style=styling)
Exemple #53
0
def Import_Data(url):
    """Using Newspaper.py, fetches body text from the given URL, returning said text, as well as the article object"""
    a = Article(url)
    a.download()
    a.parse()
    return a.text, a
Exemple #54
0
def get_article():
    url = None

    url = request.args.get('url', type=str)

    if url == None:
        return 'url parameter is required', 400

    article = Article(url)
    article.download()

    if (article.download_state == 2):
        article.parse()
        article_dict = {}
        article_dict['status'] = 'ok'

        article_dict['article'] = {}
        article_dict['article']['source_url'] = article.source_url

        try:
            guess = guess_date(url=url, html=article.html)
            article_dict['article']['published'] = guess.date
            article_dict['article']['published_method_found'] = guess.method
            article_dict['article']['published_guess_accuracy'] = None
            if guess.accuracy is Accuracy.PARTIAL:
                article_dict['article']['published_guess_accuracy'] = 'partial'
            if guess.accuracy is Accuracy.DATE:
                article_dict['article']['published_guess_accuracy'] = 'date'
            if guess.accuracy is Accuracy.DATETIME:
                article_dict['article'][
                    'published_guess_accuracy'] = 'datetime'
            if guess.accuracy is Accuracy.NONE:
                article_dict['article']['published_guess_accuracy'] = None
        except:
            article_dict['article']['published'] = article.publish_date
            article_dict['article']['published_method_found'] = None
            article_dict['article']['published_guess_accuracy'] = None

        article_dict['article']['title'] = article.title
        article_dict['article']['text'] = article.text
        article_dict['article']['authors'] = list(article.authors)

        try:
            title_lang = detect(article.title)
        except:
            title_lang = None

        try:
            text_lang = detect(article.text)
        except:
            text_lang = None

        article_dict['article']['images'] = list(article.images)
        article_dict['article']['top_image'] = article.top_image
        article_dict['article']['meta_image'] = article.meta_img
        article_dict['article']['movies'] = list(article.movies)
        article_dict['article']['meta_keywords'] = list(article.meta_keywords)
        article_dict['article']['tags'] = list(article.tags)
        article_dict['article']['meta_description'] = article.meta_description
        article_dict['article']['meta_lang'] = article.meta_lang
        article_dict['article']['title_lang'] = str(title_lang)
        article_dict['article']['text_lang'] = str(text_lang)
        article_dict['article']['meta_favicon'] = article.meta_favicon
        return jsonify(article_dict)

    else:
        article_dict = {}
        article_dict['status'] = 'error'
        article_dict['article'] = article.download_exception_msg
        return jsonify(article_dict)
Exemple #55
0
class ArticleTestCase(unittest.TestCase):
    def runTest(self):
        self.test_url()
        self.test_download_html()
        self.test_pre_download_parse()
        self.test_parse_html()
        self.test_meta_type_extraction()
        self.test_meta_extraction()
        self.test_pre_download_nlp()
        self.test_pre_parse_nlp()
        self.test_nlp_body()

    def setUp(self):
        """Called before the first test case of this unit begins
        """
        self.article = Article(
            url='http://www.cnn.com/2013/11/27/travel/weather-'
            'thanksgiving/index.html?iref=allsearch')

    def tearDown(self):
        """Called after all cases have been completed, intended to
        free resources and etc
        """
        pass

    @print_test
    def test_url(self):
        assert self.article.url == (
            'http://www.cnn.com/2013/11/27/travel/weather-'
            'thanksgiving/index.html?iref=allsearch')

    @print_test
    def test_download_html(self):
        html = mock_resource_with('cnn_article', 'html')
        self.article.download(html)
        assert len(self.article.html) == 75175

    @print_test
    def test_pre_download_parse(self):
        """Calling `parse()` before `download()` should yield an error
        """
        article = Article(self.article.url)
        self.assertRaises(ArticleException, article.parse)

    @print_test
    def test_parse_html(self):
        AUTHORS = ['Dana Ford', 'Tom Watkins']
        TITLE = 'After storm, forecasters see smooth sailing for Thanksgiving'
        LEN_IMGS = 46
        META_LANG = 'en'

        self.article.parse()
        self.article.nlp()

        text = mock_resource_with('cnn', 'txt')
        assert self.article.text == text

        # NOTE: top_img extraction requires an internet connection
        # unlike the rest of this test file
        TOP_IMG = ('http://i2.cdn.turner.com/cnn/dam/assets/131129200805-'
                   '01-weather-1128-story-top.jpg')
        assert self.article.top_img == TOP_IMG

        assert sorted(self.article.authors) == AUTHORS
        assert self.article.title == TITLE
        assert len(self.article.imgs) == LEN_IMGS
        assert self.article.meta_lang == META_LANG

    @print_test
    def test_meta_type_extraction(self):
        meta_type = self.article.extractor.get_meta_type(
            self.article.clean_doc)
        assert 'article' == meta_type

    @print_test
    def test_meta_extraction(self):
        meta = self.article.extractor.get_meta_data(self.article.clean_doc)
        META_DATA = defaultdict(
            dict, {
                'medium':
                'news',
                'googlebot':
                'noarchive',
                'pubdate':
                '2013-11-27T08:36:32Z',
                'title':
                'After storm, forecasters see smooth sailing for Thanksgiving - CNN.com',
                'og': {
                    'site_name': 'CNN',
                    'description':
                    'A strong storm struck much of the eastern United States on Wednesday, complicating holiday plans for many of the 43 million Americans expected to travel.',
                    'title':
                    'After storm, forecasters see smooth sailing for Thanksgiving',
                    'url':
                    'http://www.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html',
                    'image':
                    'http://i2.cdn.turner.com/cnn/dam/assets/131129200805-01-weather-1128-story-top.jpg',
                    'type': 'article'
                },
                'section':
                'travel',
                'author':
                'Dana Ford and Tom Watkins, CNN',
                'robots':
                'index,follow',
                'vr': {
                    'canonical':
                    'http://edition.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html'
                },
                'source':
                'CNN',
                'fb': {
                    'page_id': 18793419640,
                    'app_id': 80401312489
                },
                'keywords':
                'winter storm,holiday travel,Thanksgiving storm,Thanksgiving winter storm',
                'article': {
                    'publisher': 'https://www.facebook.com/cnninternational'
                },
                'lastmod':
                '2013-11-28T02:03:23Z',
                'twitter': {
                    'site': {
                        'identifier': '@CNNI',
                        'id': 2097571
                    },
                    'card': 'summary',
                    'creator': {
                        'identifier': '@cnntravel',
                        'id': 174377718
                    }
                },
                'viewport':
                'width=1024',
                'news_keywords':
                'winter storm,holiday travel,Thanksgiving storm,Thanksgiving winter storm'
            })

        assert meta == META_DATA

        # if the value for a meta key is another dict, that dict ought to be
        # filled with keys and values
        dict_values = [v for v in list(meta.values()) if isinstance(v, dict)]
        assert all([len(d) > 0 for d in dict_values])

        # there are exactly 5 top-level "og:type" type keys
        is_dict = lambda v: isinstance(v, dict)
        assert len(list(filter(is_dict, list(meta.values())))) == 5

        # there are exactly 12 top-level "pubdate" type keys
        is_string = lambda v: isinstance(v, str)
        assert len(list(filter(is_string, list(meta.values())))) == 12

    @print_test
    def test_pre_download_nlp(self):
        """Test running NLP algos before even downloading the article
        """
        new_article = Article(self.article.url)
        self.assertRaises(ArticleException, new_article.nlp)

    @print_test
    def test_pre_parse_nlp(self):
        """Test running NLP algos before parsing the article
        """
        new_article = Article(self.article.url)
        html = mock_resource_with('cnn_article', 'html')
        new_article.download(html)
        self.assertRaises(ArticleException, new_article.nlp)

    @print_test
    def test_nlp_body(self):
        KEYWORDS = [
            'balloons', 'delays', 'flight', 'forecasters', 'good', 'sailing',
            'smooth', 'storm', 'thanksgiving', 'travel', 'weather', 'winds',
            'york'
        ]
        SUMMARY = mock_resource_with('cnn_summary', 'txt')
        assert self.article.summary == SUMMARY
        assert sorted(self.article.keywords) == sorted(KEYWORDS)
Exemple #56
0
###### Yesterday's Post Content ######

# Importing libraries
from newspaper import Article

# Receiving the article URL
url = 'https://edition.cnn.com/2021/04/24/politics/inequality-biden-100-days/index.html'

# Assigning the Article class to a variable
artigo = Article(url)

artigo.download()  # Downloads the article
artigo.parse()  # Scrapes the site, extracting the article
artigo.nlp()  # Extracts the summary & keywords

# Gets the title content
titulo = artigo.title
# TITLE TEXT

# Gets the summary content
conteudo_texto = artigo.summary
# SUMMARY TEXT

###### Yesterday's Post Content ######

###### Today's Post Content ######

# installing libraries
# pip install -U textblob
# python -m textblob.download_corpora
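
The comments above only install TextBlob for the follow-up post; a minimal sketch of what that next step presumably looks like (assumed, not taken from the original) is to score the summary extracted above with TextBlob's sentiment analyzer:

from textblob import TextBlob

# polarity runs from -1.0 (negative) to 1.0 (positive),
# subjectivity from 0.0 (objective) to 1.0 (subjective)
blob = TextBlob(conteudo_texto)
print(blob.sentiment.polarity)
print(blob.sentiment.subjectivity)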
Exemple #57
0
def get_news(message):
    import numpy as np
    import matplotlib.pyplot as plt
    import pandas as p

    #Cleaning the texts
    import re
    import nltk
    nltk.download('stopwords')
    from nltk.corpus import stopwords
    from nltk.stem.porter import PorterStemmer
    from nltk.stem import WordNetLemmatizer
    from textblob import TextBlob
    lemmatizer = WordNetLemmatizer()
    corpus=[]
    bow=[]
    nltk.download('wordnet')
    review=re.sub('[^a-zA-Z]', ' ', message)
    review=review.lower()#converts all characters to lowercase
    review=review.split()#splits the sentence into a list
    lemmatizer = WordNetLemmatizer()
    review=[lemmatizer.lemmatize(word,pos="v") for word in review if not word in set(stopwords.words('english'))]# removal of stopwords
    review=' '.join(review)#converting the list back into a sentence
    corpus.append(review)#creating a list of sentences
    bow.append(review.split(" "))#creating a list of words in each sentences and storing it in a list
    bowa=review.split()
    bowb=set(bowa)
    worddict=dict.fromkeys(bowb,0)


    #SENTIMENT ANALYSIS
    
    def clean_text(inp):
        '''
        Utility function to clean text by removing links, special characters
        using simple regex statements.
        '''
        return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", inp).split())

    def get_text_sentiment(inp):
            '''
            Utility function to classify sentiment of passed text
            using textblob's sentiment method
            '''
            analysis = TextBlob(clean_text(inp))
            if analysis.sentiment.polarity > 0:
                return 'positive'
            elif analysis.sentiment.polarity == 0:
                return 'neutral'
            else:
                return 'negative'


    def get_texts(inp):
            
            text_sentl=[]
            for t in inp:
                text_sent={}
                text_sent['text'] = t
                text_sent['sentiment'] = get_text_sentiment(t)
                text_sentl.append(text_sent)
            return text_sentl



    
    #finding the frequency of words in each sentence
    for word in bowa:
        worddict[word]+=1

    #computing the term frequency
    def computeTF(wordDict, bow):
        tfDict = {}
        bowCount = len(bow)
        for word, count in wordDict.items():
            tfDict[word] = count/float(bowCount)
        return tfDict
    tfBowA = computeTF(worddict, bowa)
    from collections import Counter
    # Initial Dictionary 
    k = Counter(tfBowA) 
    # Finding the 10 highest values 
    high = k.most_common(10)  
    #print(high,"\n")
    sentence=[]
    for i in high: 
        sentence.append(i[0])


    def get_cosine_sim(*strs): 
        vectors = [t for t in get_vectors(*strs)]
        d1=np.array([vectors[0]])
        d2=np.array([vectors[1]])
        return cosine_similarity(d1,d2)
    
    def get_vectors(*strs):
        text = [t for t in strs]
        vectorizer = CountVectorizer(text)
        vectorizer.fit(text)
        return vectorizer.transform(text).toarray()

    
    
    
    #SCRAPING
    from googlesearch import search
    from newspaper import Article
    links=list()
    sentence=' '.join(sentence)
    query =sentence
    print(query)
    for j in search(query, tld="com", num=10, start=0, stop=10, pause=2.0): 
        #print(j)
        links.append(j)
    global pos
    global neg
    global nu

    #GETS THE ARTICLES FROM THEIR LINKS

    flag=0
    for k in links:
        if((k[:20]=="https://timesofindia") | (k[:18]=="https://www.news18") | (k[:26]=="https://www.hindustantimes") | (k[:21]=="https://indianexpress")\
                                           | (k[:20]=="https://www.livemint") | (k[:21]=="https://economictimes")\
                                           | (k[:22]=="https://www.indiatoday") | (k[:20]=="https://gadgets.ndtv")\
                                           | (k[:24]=="https://www.timesnownews") | (k[:19]=="https://edition.cnn")\
                                           | (k[:15]=="https://www.bbc") | ("washingtonpost" in k) | ("theguardian" in k) | ("news.com.au" in k)\
                                           | ("abc.net.au" in k) | ("www.nytimes" in k) | ("www.bloomberg" in k) | ("www.dailymail" in k)\
                                           | ("www.newyorker" in k) | ("www.mirror.co" in k) | ("www.telegraph.co" in k) | ("news.sky" in k) | ("wikipedia.org" in k)):
            #A new article from TOI
            url=k
            #For different language newspaper refer above table 
            article = Article(url, language="en") # en for English 
  
            #To download the article 
            article.download() 
 
            #To parse the article 
            article.parse()

            #To perform natural language processing ie..nlp 
            article.nlp()


            #CHECKING SENTIMENT
            temp=(article.text).split('\n')
            file=open(r"C:\Users\Saksham\Desktop\article.txt","a+",encoding="utf-8")
            file.writelines(temp)
            file=open(r"C:\Users\Saksham\Desktop\article.txt","r",encoding="utf-8")
            t=file.read()
            text=[t]
            textinp=get_texts(text)
            for i in textinp:
                print(i['sentiment'])
                if(i['sentiment']=="positive"):
                    pos=pos+1
                elif(i['sentiment']=="negative"):
                    neg=neg+1
                else:
                    nu=nu+1;
            file=open(r"C:\Users\Saksham\Desktop\article.txt","w",encoding="utf-8")

            #FINDING THE COSSIM VALUE
            message2=article.text
            from sklearn.feature_extraction.text import CountVectorizer
            from sklearn.metrics.pairwise import cosine_similarity
            cossim=get_cosine_sim(message,message2)
            if(cossim<0.75):
                lines=message2.split('.')
                for line in lines:
                    cossim=get_cosine_sim(message,line)
                    cossim=cossim[0][0]
                    if(cossim>0.75 or cossim>0.4):
                        break
            
    if(pos>neg and pos>nu):
        sent="positive"
    elif(neg>pos and neg>nu):
        sent="negative"
    else:
        sent="neutral"


    
    if(cossim>=0.6):
        label['text']="It is true and similarity co-efficient is:",str(cossim),"sentiment is ",sent
    elif(cossim<0.6 and cossim>0.4):
        label['text']="Data is insufficient",str(cossim),"sentiment is ",sent
    else:
        label['text']="It is false and similarity co-efficient is:",str(cossim),"sentiment is ",sent
Exemple #58
0
def extract_headlines_news(code, headlines_links, cursor):
    '''Extract Yahoo news links and parse each article'''

    in_sql = """ INSERT INTO yahoo_comp_news(url_md5,url,code_name,newspaper_title,newspaper_text,
newspaper_authors,newspaper_summary,newspaper_keywords,boilerpipe_article,
boilerpipe_articlesentences,boilerpipe_keepeverything,boilerpipe_largestcontent,
boilerpipe_numwordsrules,boilerpipe_canola,up_time,add_time,count)
VALUES
(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,now(),now(),1)
on duplicate key update code_name = %s,newspaper_title = %s,newspaper_text = %s,
newspaper_authors = %s,newspaper_summary = %s,newspaper_keywords = %s,
boilerpipe_article = %s,boilerpipe_articlesentences = %s,boilerpipe_keepeverything = %s,
boilerpipe_largestcontent = %s,boilerpipe_numwordsrules = %s,boilerpipe_canola = %s,
up_time = now(),count=count+1 """

    for link in headlines_links:
        # URLs shorter than 35 characters are generally not news links
        if link is None or len(link) <= 35:
            continue
        try:
            url_md5 = hashlib.md5(link).hexdigest()
            # first, use newspaper
            newspaper_title = ''
            newspaper_text = ''
            newspaper_authors = ''
            newspaper_summary = ''
            newspaper_keywords = ''
            article = Article(link)
            article.download()
            html = article.html
            if html is None or len(html) == 0:
                continue
            article.parse()
            if article.text and len(article.text) > 0:
                newspaper_title = article.title
                newspaper_text = article.text
                newspaper_authors = article.authors
                if newspaper_authors and len(newspaper_authors) > 0:
                    newspaper_authors = ','.join(newspaper_authors)
                else:
                    newspaper_authors = ''

                article.nlp()
                newspaper_summary = article.summary
                newspaper_keywords = article.keywords
                if newspaper_keywords and len(newspaper_keywords) > 0:
                    newspaper_keywords = ','.join(newspaper_keywords)
                else:
                    newspaper_keywords = ''

            # then use boilerpipe

            extractor = Extractor(extractor='ArticleExtractor', html=html)
            boilerpipe_article = extractor.getText()

            extractor = Extractor(extractor='ArticleSentencesExtractor',
                                  html=html)
            boilerpipe_articlesentences = extractor.getText()

            extractor = Extractor(extractor='KeepEverythingExtractor',
                                  html=html)
            boilerpipe_keepeverything = extractor.getText()

            extractor = Extractor(extractor='LargestContentExtractor',
                                  html=html)
            boilerpipe_largestcontent = extractor.getText()

            extractor = Extractor(extractor='NumWordsRulesExtractor',
                                  html=html)
            boilerpipe_numwordsrules = extractor.getText()

            extractor = Extractor(extractor='CanolaExtractor', html=html)
            boilerpipe_canola = extractor.getText()

            # input parameters
            content = (url_md5,link,code, newspaper_title, newspaper_text, newspaper_authors,newspaper_summary,newspaper_keywords,\
                       boilerpipe_article,boilerpipe_articlesentences,boilerpipe_keepeverything,boilerpipe_largestcontent,\
                       boilerpipe_numwordsrules,boilerpipe_canola,   \
                       code, newspaper_title,newspaper_text, newspaper_authors,\
                       newspaper_summary,newspaper_keywords,boilerpipe_article,boilerpipe_articlesentences,boilerpipe_keepeverything,\
                       boilerpipe_largestcontent,boilerpipe_numwordsrules,boilerpipe_canola)
            cursor.execute(in_sql, content)

        except:
            logger.error("crawl_page failed ,Error:%s" %
                         traceback.format_exc())
Exemple #59
0
    def calculate_summary(self, list_of_events):
        """ Returns event_data dictionary containing information for each event. 
            Return Format:
            {
             Event1: {article1 :{raw_text:"",url:"",summary:""}, article2:{raw_text:"",url:"",summary:""},... },
             Event2: {article1 :{raw_text:"",url:"",summary:""}, article2:{raw_text:"",url:"",summary:""},... },
             .....
             }

        """

        event_data = defaultdict(lambda: {})
        for event in list_of_events:
            event = self.get_event_as_string(event)
            conn = httplib.HTTPSConnection('api.cognitive.microsoft.com')
            params = self.create_params(event)
            conn.request("GET", "/bing/v5.0/news/search?%s" % params, "{body}",
                         self.headers)
            response = conn.getresponse()
            data = response.read()
            j = json.loads(data)
            count = 0
            article_data = defaultdict(lambda: {})
            for x in j['value']:
                article = Article(x['url'], language='es')
                article.download()
                article.parse()
                article_text = article.text
                temp = article_text.split('.')
                raw_article = ""
                if len(temp) > 5:
                    count += 1
                    article_txt = []
                    article_txt.append(event)
                    for sent in temp:
                        new_sent = sent.replace('\n', "")
                        raw_article = raw_article + new_sent + ". "
                        article_txt.append(new_sent)

                    vect = TfidfVectorizer(min_df=1)
                    tfidf = vect.fit_transform(article_txt)
                    mod_vect = (tfidf * tfidf.T).A
                    input_event = mod_vect[0]
                    cosine_vals = {}
                    for i in range(len(mod_vect) - 1):
                        article_sentence = mod_vect[i + 1]
                        cosine_similarity = self.calculate_cosine(
                            input_event, article_sentence)
                        cosine_vals[i + 1] = 1.0 - float(cosine_similarity)
                    sorted_list = sorted(cosine_vals.items(),
                                         key=lambda x: x[1])
                    summary = ""

                    top3 = itertools.islice(sorted_list, 3)
                    for summary_sentence in top3:
                        summary_sentence = article_txt[int(
                            summary_sentence[0])]
                        summary = summary + summary_sentence + u". "

                    data = {}
                    data["url"] = x['url']
                    data["raw_text"] = raw_article
                    data["summary"] = summary
                    article_data["article" + str(count)] = data

                event_data[event] = article_data

        #print event_data
        conn.close()
        return event_data
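
The sentence-ranking core of calculate_summary can be isolated as a short sketch. This is a simplification, using sklearn's cosine_similarity directly instead of the custom calculate_cosine above: sentences most similar to the event string under TF-IDF become the summary.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def top_sentences(query, sentences, k=3):
    """Return the k sentences most similar to the query under TF-IDF cosine similarity."""
    tfidf = TfidfVectorizer(min_df=1).fit_transform([query] + sentences)
    sims = cosine_similarity(tfidf[0:1], tfidf[1:]).ravel()
    ranked = sorted(zip(sims, sentences), key=lambda pair: pair[0], reverse=True)
    return [sentence for _, sentence in ranked[:k]]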
Exemple #60
0
    clean_string = ' '.join(clean_string.split())
    return clean_string

st.write("Good news! I found some news articles using Google News!")

# parse articles
data = {}

for google_news_article in articles:
    google_news_article = 'http://' + google_news_article
    r = requests.get(google_news_article)
    article_url = r.url

    try:
        html = Article(article_url, config=config)
        html.download()
        html.parse()
        website_text = clean_text(html.text)
        data[article_url] = website_text
    except Exception as e:
        pass
    
df = pd.DataFrame(data.items(), columns=['url','website_text'])
df.dropna(inplace=True)
df = df[df['website_text'].str.len() > 50]

st.write("I just finished reading through those articles. They seem interesting!")

# nlp pre-processing

stop_words=stopwords.words('english')+list(string.punctuation) + ['“','”','–','—','’']