Example #1
def parse_article(url, lang, featured=0, db=None):
    # a default of connect_db() would run once at definition time and reuse one connection; connect lazily instead
    if db is None:
        db = connect_db()
    cur = db.execute("select * from articles where url=?", (url,))
    entries = [
        dict(id=row[0], url=row[1], title=row[2], image=row[3], text=row[4],
             authors=row[5], date=row[6], featured=row[7], language=row[8])
        for row in cur.fetchall()
    ]

    if len(entries) >= 1:
        return entries[0]

    article = Article(url)
    article.download()

    try:
        article.parse()
    except:
        return None

    title = article.title
    image = article.top_image
    text = article.text
    authors = ",".join(article.authors)
    date = int(time.mktime(article.publish_date.timetuple())) if type(article.publish_date) is datetime.datetime else 0

    db.execute("insert into articles (url, title, image, text, authors, date, featured, language) values (?, ?, ?, ?, ?, ?, ?, ?)", (url, title, image, text, authors, date, featured and len(text) >= 50, lang))
    db.commit()

    idquery = db.execute("select (id) from articles where url=?", (url,))
    id = [row[0] for row in idquery.fetchall()][0]

    return {"id": id, "url": url, "title": title, "image": image, "text": text, "authors": authors, "date": date, "language": lang}
Example #2
def get_article():
	tree_urls = ET.parse("DB_urls.xml")
	root_urls = tree_urls.getroot()

	# The problem with English and Chinese can be solved with 
	for field_urls in root_urls.findall("row"):
		url_urls = field_urls.find("field").text
	#	url_urls = 'http://news.sina.com.cn/c/2014-04-21/204729980947.shtml'
	#	url_urls = 'http://china.caixin.com/2013-12-30/100623243.html'

		try:
			response = urllib2.urlopen(url_urls)
			status = response.code

			#print "detected webpage code:", status

			if(status == 404):
				continue
			else:
				a_zh = Article(url_urls, language = 'zh')
				a_zh.download()
				a_zh.parse()
				content_urls = a_zh.text

				if(content_urls == ''):
					a_en = Article(url_urls, language = 'en')
					a_en.download()
					a_en.parse()
					content_urls = content_urls + a_en.text

				if(content_urls != ''):
					compare_article(url_urls, content_urls)			
		except:
			pass
Example #3
 def test_pre_parse_nlp(self):
     """Test running NLP algos before parsing the article
     """
     new_article = Article(self.article.url)
     resp = mock_response_with(new_article.url, 'cnn_article')
     new_article.download(resp)
     self.assertRaises(ArticleException, new_article.nlp)
Example #4
def get_details():
    url = request.args.get('url', '')
    if not url:
      abort(400)

    if is_image(url):
      result = {
        "url": url,
        "top_image": url,
        "text": "",
      }
      return jsonify(result)

    article = Article(url)
    article.download()

    try:
      article.parse()
    except (IOError, UnicodeDecodeError):
      return '', 422

    try:
      top_image = article.top_image.rsplit('?',1)[0]
    except AttributeError:
      top_image = ''

    result = {
      "url": url,
      "top_image": top_image,
      "text": article.text,
    }

    return jsonify(result)
def get_text(url):
    article = Article(url)
    article.download()
    article.parse()
    article.nlp()  # keywords and summary below are only populated after nlp()
    authors=article.authors
    publish_date=article.publish_date # TODO: Slice publish date
    body_text=article.text
    body_text=body_text.replace('"','')  # strip double quotes
    keywords=article.keywords
    summary=article.summary
    title=article.title
    tags=article.tags

    #print body_text

    title=strip_non_ascii(title)
    summary=strip_non_ascii(summary)
    body_text=strip_non_ascii(body_text)
    keywords=' '.join(keywords)
    keywords=strip_non_ascii(keywords)

    #print (title, summary, authors, publish_date, body_text, keywords)

    return (title, summary, authors, publish_date, body_text, keywords, tags)
Example #6
 def test_arabic_fulltext_extract(self):
     url = "http://arabic.cnn.com/2013/middle_east/8/3/syria.clashes/index.html"
     article = Article(url=url, language="ar")
     article.download()
     article.parse()
     with codecs.open(os.path.join(TEXT_FN, "arabic_text_1.txt"), "r", "utf8") as f:
         assert article.text == f.read()
Example #7
 def test_spanish_fulltext_extract(self):
     url = "http://ultimahora.es/mallorca/noticia/noticias/local/fiscalia-anticorrupcion-estudia-recurre-imputacion-infanta.html"
     article = Article(url=url, language="es")
     article.download()
     article.parse()
     with codecs.open(os.path.join(TEXT_FN, "spanish_text_1.txt"), "r", "utf8") as f:
         assert article.text == f.read()
Example #8
    def run(self):
        logging.debug("run() - [WAIT]")
        from newspaper import Article

        '''
        Library documentation: http://newspaper.readthedocs.org/en/latest/user_guide/quickstart.htm
        '''

        NOTES_LIST = [
            '118',
            '117',
            # '116',
            # '115',
        ]
        for note_id in NOTES_LIST:
            note = Article(url="http://site.tiagoprnl.in/core/visitor_home/nota/%s/" % note_id)
            note.download()

            print '*' * 100
            # print 'H T M L'
            # print note.html
            #print '*' * 100
            # print 'T E X T'
            note.parse()
            print note.text


        logging.debug("run() - [DONE]")
Example #9
 def test_chinese_fulltext_extract(self):
     url = "http://www.bbc.co.uk/zhongwen/simp/chinese_news/2012/12/121210_hongkong_politics.shtml"
     article = Article(url=url, language="zh")
     article.download()
     article.parse()
     with codecs.open(os.path.join(TEXT_FN, "chinese_text_1.txt"), "r", "utf8") as f:
         assert article.text == f.read()
Example #10
 def test_pre_parse_nlp(self):
     """Test running NLP algos before parsing the article
     """
     new_article = Article(self.article.url)
     html = mock_resource_with('cnn_article', 'html')
     new_article.download(html)
     self.assertRaises(ArticleException, new_article.nlp)
Example #11
def main():
    try:
        headlines = requests.get(headline_url)
        
        headlines = json.loads(headlines.text)
        for headline in headlines['Headlines']:
            print("Processing Article %s" % headline['Url'])
            article = Article(headline['Url'])
            article.download()
            article.parse()
            
            
            response = requests.post(calais_url, files={'file': article.text}, headers=headers, timeout=80)
            rdf = json.loads(response.text)
            
            for x in rdf:
                if '_type' in rdf[x] and 'name' in rdf[x]:
                    print("Output for %s %s" % (rdf[x]['_type'], rdf[x]['name']))
                    for instance in rdf[x]['instances']:
                        text = instance['prefix'] + instance['suffix']
                        blob = TextBlob(text)
                        for sentence in blob.sentences:
                            print(sentence)
                            print(sentence.sentiment.polarity)
            print('--------------------')
            
            #print(rdf)
    except Exception as e:
        print ('Error in connect ' , e)
Example #12
 def check_url(args):
     """
     :param (basestr, basestr) url, res_filename:
     :return: (pubdate_failed, fulltext_failed)
     """
     url, res_filename = args
     pubdate_failed, fulltext_failed = False, False
     html = mock_resource_with(res_filename, 'html')
     try:
         a = Article(url)
         a.download(html)
         a.parse()
         if a.publish_date is None:
             pubdate_failed = True
     except Exception:
         print('<< URL: %s parse ERROR >>' % url)
         traceback.print_exc()
         pubdate_failed, fulltext_failed = True, True
     else:
         correct_text = mock_resource_with(res_filename, 'txt')
         if not (a.text == correct_text):
             # print('Diff: ', simplediff.diff(correct_text, a.text))
             # `correct_text` holds the reason of failure if failure
             print('%s -- %s -- %s' %
                   ('Fulltext failed',
                    res_filename, correct_text.strip()))
             fulltext_failed = True
             # TODO: assert statements are commented out for full-text
             # extraction tests because we are constantly tweaking the
             # algorithm and improving
             # assert a.text == correct_text
     return pubdate_failed, fulltext_failed
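check_url takes a single (url, res_filename) tuple rather than two separate arguments, which suggests it is meant to be mapped over a list of test cases (for example with a process or thread pool). A hedged usage sketch; the pairs are illustrative and assume matching mock resource files exist for mock_resource_with():

cases = [
    ('http://example.com/some-story', 'example1'),   # illustrative (url, resource) pairs
    ('http://example.com/other-story', 'example2'),
]
results = [check_url(case) for case in cases]  # or pool.map(check_url, cases)
pubdate_failures = sum(1 for pubdate_failed, _ in results if pubdate_failed)
fulltext_failures = sum(1 for _, fulltext_failed in results if fulltext_failed)
print(pubdate_failures, 'pubdate failures,', fulltext_failures, 'fulltext failures')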
Example #13
def main(argv):
    if len(argv) > 1:
        htmlist = argv[1]
    else:
        htmlist = 'htmlist'

    # Our permanent config for html cleaning
    config = Config()
    config.language = 'id'
    config.MIN_SENT_COUNT = 20
    config.memoize = False
    config.fetch_images = False
    config.verbose= True

    cleaner = Article(url='', config=config)

    with open(htmlist, 'r') as f:
        htmfile = f.read().split('\n')

    raw = []

    for htm in htmfile:
        print (htm)
        if not htm.endswith("rss.html"):
            with open(htm, 'r') as f:
                h = f.read()

            cleaner.set_html(h)
            cleaner.parse()
            sentences = nlp.split_sentences(cleaner.text)
            #raw.append(sentences])
        
            with open('htm-out', 'a') as f:
                [f.write(r + '\n') for r in sentences]
Example #14
def scrapeURLS(inFilPath):
    texts = []
    cache = loadCache()
    toDelURLs = []
    with open(inFilPath) as f:
        urls = f.readlines()
    for url in urls:
        if filter(urlFilters, url):
            toDelURLs.append(url)
            
        if url in cache:
            txt = cache[url]
        else:
            print "Scraping URL %s" % url
            article = Article(url)
            article.download()
            article.parse()
            txt = article.text.replace("\n", " ").replace("  ", " ").strip()
            if txt == "" or filter(txtFilter, txt):
                toDelURLs.append(url)
                continue
            cacheURL(url, txt)
        texts.append(txt)
        deleteURLs(inFilPath, toDelURLs)
    return texts
Example #15
 def test_chinese_fulltext_extract(self):
     url = 'http://www.bbc.co.uk/zhongwen/simp/chinese_news/2012/12/121210_hongkong_politics.shtml'
     article = Article(url=url, language='zh')
     article.build()
     # assert isinstance(article.stopwords_class, StopWordsChinese)
     with codecs.open(os.path.join(TEXT_FN, 'chinese_text_1.txt'), 'r', 'utf8') as f:
         assert article.text == f.read()
Example #16
File: page.py Project: tfgg/ppsay
    def wrap_newspaper(self, web_page):
        parser = NewspaperArticle(url=web_page.final_url)
        parser.html = web_page.html
        parser.is_downloaded = True
        parser.parse()

        return parser
Example #17
def f(url):
	url_urls = url.text
	try:
		response = urllib2.urlopen(url_urls)
		status = response.code

		#print "detected webpage code:", status

		if(status == 404):
			pass
		else:
			a_zh = Article(url_urls, language = 'zh')
			a_zh.download()
			a_zh.parse()
			# content_urls = a_zh.text

			# if(content_urls == ''):
			# 	a_en = Article(url_urls, language = 'en')
			# 	a_en.download()
			# 	a_en.parse()
			# 	content_urls = content_urls + a_en.text

			# if(content_urls != ''):
			# 	pass
			# 	# compare_article(url_urls, content_urls)			
	except:
		pass
Example #18
 def test_download_file_success(self):
     url = "file://" + os.path.join(HTML_FN, "cnn_article.html")
     article = Article(url=url)
     article.download()
     self.assertEqual(article.download_state, ArticleDownloadState.SUCCESS)
     self.assertEqual(article.download_exception_msg, None)
     self.assertEqual(75406, len(article.html))
Example #19
 def test_download_file_failure(self):
     url = "file://" + os.path.join(HTML_FN, "does_not_exist.html")
     article = Article(url=url)
     article.download()
     self.assertEqual(0, len(article.html))
     self.assertEqual(article.download_state, ArticleDownloadState.FAILED_RESPONSE)
     self.assertEqual(article.download_exception_msg, "No such file or directory")
Example #20
def parse_input(text, extractor='newspaper'):
    if isinstance(text, str) or isinstance(text, unicode):
        if text.startswith(('http://', 'https://')):
            # Input is a link - need to extract the text from html
            if extractor.lower() == 'goose':
                from goose import Goose
                urlparse = Goose()
                article = urlparse.extract(url=text)
                return unicode_to_ascii(article.cleaned_text)
            else:
                from newspaper import Article
                article = Article(text)
                article.download()
                article.parse()
                return unicode_to_ascii(article.text)
        elif text.endswith('.txt'):
            # Input is a file - need to read it
            textfile = open(text, 'rb')
            article = textfile.read()
            textfile.close()
            return unicode_to_ascii(article)
        else:
            # Input is a string containing the raw text
            return unicode_to_ascii(text)
    else:
        raise ValueError('Input text must be of type str or unicode.')
Example #21
def extract(url=None, keep_html=True):
    """ Attempts to extract article from URL """
    a = Article(url, keep_article_html=keep_html)
    try:
        a.download()
    except Exception, e:
        log.error('Error downloading %s: %s' % (url, str(e)))
Example #22
    def parse_news(self, response):
        item = ScrapyGooglenewsItem()
        #only log the warning info from request
        logging.getLogger("requests").setLevel(logging.WARNING)

        for href in response.xpath('//h2[@class="title"]/a/@href').extract():
            item['link'] = href
            #use newspaper-0.0.8 to scrape the webpage, then get clean text.
            article = Article(item['link'])
            article.download()
            article.parse()
            item['title'] = article.title
            item['text'] = article.text
            #item['authors'] = article.authors
            #item['date'] = article.publish_date

            if response.url.split('&')[-1] == 'topic=w':
                item['domain'] = 'World'
            if response.url.split('&')[-1] == 'topic=n':
                item['domain'] = 'U.S.'
            if response.url.split('&')[-1] == 'topic=b':
                item['domain'] = 'Business'
            if response.url.split('&')[-1] == 'topic=tc':
                item['domain'] = 'Technology'
            if response.url.split('&')[-1] == 'topic=e':
                item['domain'] = 'Entertainment'
            if response.url.split('&')[-1] ==  'topic=s':
                item['domain'] = 'Sports'
            if response.url.split('&')[-1] ==  'topic=snc':
                item['domain'] = 'Science'
            if response.url.split('&')[-1] ==  'topic=m':
                item['domain'] = 'Health'

            yield item
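The chain of if tests on response.url above checks the same query fragment eight times; the topic-code-to-section mapping could be expressed as a lookup table instead. A sketch using only the codes listed above (TOPIC_DOMAINS and domain_for are illustrative names):

TOPIC_DOMAINS = {
    'topic=w': 'World', 'topic=n': 'U.S.', 'topic=b': 'Business',
    'topic=tc': 'Technology', 'topic=e': 'Entertainment', 'topic=s': 'Sports',
    'topic=snc': 'Science', 'topic=m': 'Health',
}

def domain_for(url):
    # Map a Google News listing URL to its section name; 'Unknown' if the code is unrecognized.
    return TOPIC_DOMAINS.get(url.split('&')[-1], 'Unknown')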
    def extract(self, item):
        """Creates an instance of Article without a Download and returns an ArticleCandidate with the results of
        parsing the HTML-Code.

        :param item: A NewscrawlerItem to parse.
        :return: ArticleCandidate containing the recovered article data.
        """
        article_candidate = ArticleCandidate()
        article_candidate.extractor = self._name()

        article = Article('')
        article.set_html(item['spider_response'].body)
        article.parse()
        article_candidate.title = article.title
        article_candidate.description = article.meta_description
        article_candidate.text = article.text
        article_candidate.topimage = article.top_image
        article_candidate.author = article.authors
        if article.publish_date is not None:
            try:
                article_candidate.publish_date = article.publish_date.strftime('%Y-%m-%d %H:%M:%S')
            except ValueError as exception:
                self.log.debug('%s: Newspaper failed to extract the date in the supported format,'
                              'Publishing date set to None' % item['url'])
        article_candidate.language = article.meta_lang

        return article_candidate
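The extractor above never downloads anything itself: it hands pre-fetched HTML to newspaper via set_html() and then parses. A minimal standalone sketch of the same pattern (the function name and return tuple are illustrative):

from newspaper import Article

def parse_prefetched_html(html, url=''):
    # Parse markup that another component already downloaded; no network request is made.
    article = Article(url)
    article.set_html(html)
    article.parse()
    return article.title, article.text, article.meta_lang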
def insert_url(url):
    conn = sqlite3.connect('publico_news_sqllite3.db')
    cursor = conn.cursor()

    # get the article in plain text
    article = Article(url)
    article.download()
    article.parse()
    date = article.publish_date
    title = article.title
    text = article.text

    item = dict()
    item['datetime'] = date
    item['title'] = title
    item['text'] = text
    item['category'] = sys.argv[1].split('/')[6]
    item['link'] = sys.argv[1]
    item['origLink'] = sys.argv[1]

    print(item['category'])
    print(item['datetime'])

    if not duplicate(item, item['category'], cursor):
        status = insert_db(item, item['category'], cursor)
        if status == 1:
            print(sys.argv[1], "inserted")
        else:
            print("Error", status)
    else:
        print(url, "already in BD")

    conn.commit()
    conn.close()
Example #25
def makeDocs():
    utc = pytz.utc
    es = Elasticsearch(BONSAI_URL, verify_certs= True)
    es.indices.delete(index='news', ignore=[400, 404])
    es.indices.create(index='news', ignore=400)

    print "Created"
    cnn_paper = newspaper.build(u'http://cnn.com', memoize_articles=False)
    a = defaultdict(int)
    cnn_articles = cnn_paper.articles
    print cnn_paper.size()
    for i in range(10):
        article = cnn_articles[i]
        url = article.url
        art = Article(url)
        art.download()
        art.parse()
        print art.publish_date
        print art.text
        print "Article" + str(i)
        print art.publish_date is not None
        print art.text is not None
        if (art.publish_date is not None) and (art.text is not None):
            try:
                doc = {
                'domain': 'CNN',
                'date': utc.localize(art.publish_date), 
                'text': art.text
                }
                res = es.index(index="news", doc_type='article', id=i, body=doc)
                print "Doc" + str(i)
            except:
                print "Doc not accepted"
Example #26
    def runTest(self):
        # The "correct" fulltext needs to be manually checked
        # we have 50 so far
        FULLTEXT_PREPARED = 50
        domain_counters = {}

        with open(URLS_FILE, 'r') as f:
            urls = [d.strip() for d in f.readlines() if d.strip()]

        for url in urls[:FULLTEXT_PREPARED]:
            domain = get_base_domain(url)
            if domain in domain_counters:
                domain_counters[domain] += 1
            else:
                domain_counters[domain] = 1

            res_filename = domain + str(domain_counters[domain])
            html = mock_resource_with(res_filename, 'html')
            try:
                a = Article(url)
                a.download(html)
                a.parse()
            except Exception:
                print('<< URL: %s parse ERROR >>' % url)
                traceback.print_exc()
                continue

            correct_text = mock_resource_with(res_filename, 'txt')
            condensed_url = url[:30] + ' ...'
            print('%s -- fulltext status: %s' %
                  (condensed_url, a.text == correct_text))
 def test2(self):
     articles =[
      'http://www.radionz.co.nz/news/national/281869/seven-arrests-over-police-manhunt',
      'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11491573',
      'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11580358',
      'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11580350',
      'http://www.stuff.co.nz/national/crime/75978990/whanganui-woman-accused-of-leaving-child-in-car-overnight.html',
      'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11574608',
      'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11577923',
      'http://www.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11591401',
      'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11566180'
      ]
     
     articles = [
      'http://www.express.co.uk/news/uk/657926/New-Zealand-John-Key-slams-David-Cameron-Britain-forgetting-history-European-Union-EU',
      'http://www.bbc.co.uk/news/uk-wales-35954982',
      'http://www.telegraph.co.uk/news/2016/04/04/david-cameron-will-be-an-excellent-former-prime-minister/',
      'http://www.pressandjournal.co.uk/fp/news/aberdeenshire/880519/david-camerons-father-named-panamanian-company-aberdeenshire-home/',
      'http://www.theguardian.com/politics/2016/apr/01/senior-tories-brexit-vote-leave-attacks-david-cameron-letter-nhs-staff',
      'http://www.dailymail.co.uk/news/article-3519908/Nuclear-drones-threat-British-cities-Cameron-Obama-hold-war-game-session-respond-attack-kill-thousands-people.html',
      'http://www.telegraph.co.uk/news/2016/03/31/if-david-cameron-cant-stop-the-tory-fighting-hell-clear-jeremy-c/',
      'http://www.manchestereveningnews.co.uk/news/greater-manchester-news/gmp-boost-number-armed-officers-11125178',
      'http://www.theguardian.com/commentisfree/2016/apr/03/cameron-headphones-what-is-cool-what-is-not']
     
     with open("./Output2.txt", "w") as text_file:
         for url in articles:
             print(url)
             a = Article(url)
             a.download()
             a.parse()
             text_file.write(a.text.encode('utf-8'))
             text_file.write('\n')
Example #28
def get_article(url):
    a = Article(url)
    a.download()
    a.parse()

    article = dict()

    article['title'] = a.title
    article['publish_date'] = a.publish_date  # the attribute is publish_date, not published_date
    article['authors'] = a.authors
    article['lead_image'] = a.top_image
    article['movies'] = a.movies
    article['text'] = a.text
    article['keywords'] = get_keywords(a.text)


    # This is more likely to fail.
    # try:
    #     article.nlp()
    #     article['summary'] = 'This summary is generated: \n ' + a.summary
    # except Exception:
    #     print Exception
    #     article['summary'] = a.summary

    return article
Example #29
def get_image():
  url = request.args.get('url', '')
  if not url:
    abort(400)

  if is_image(url):
    return redirect(url)

  article = Article(url)
  article.download()

  try:
    article.parse()
  except (IOError, UnicodeDecodeError):
    return '', 422

  try:
    top_image = article.top_image.rsplit('?',1)[0]
  except AttributeError:
    top_image = ''

  if not top_image == '':
    return redirect(top_image)
  else:
    return '', 422
 def get_article_by_url(url):
     article = Article(url, fetch_images=False)
     article.download()
     if url == "empty":
         return "nolist"
     article.parse()
     return article.text
Example #31
    #soup = BeautifulSoup(html,"html5lib")
    soup = BeautifulSoup(html, "lxml")
    print(line)
    if soup.title:
        print(soup.title.string)
    regexp = re.compile("地址|电话")
    for b in soup.find_all(text=regexp):
        print(b)
    for a in soup.find_all('a'):
        key = a.string
        if isinstance(key, (str, bytes)):
            if re.search(pattern1, key):
                print('**************')
                print(key)
                if 'http' in a['href']:
                    try:
                        a = Article(a['href'], language='zh')
                        a.download()
                        a.parse()
                    except newspaper.article.ArticleException:
                        print(
                            'failed with 404 Client Error: Not Found for url')
                    print(a.text)
                else:
                    url = urllib.parse.urljoin(line.rstrip(), a['href'])
                    a = Article(url, language='zh')
                    a.download()
                    a.parse()
                    print(a.text)
                print('**************')
def getTitle(url):
    article = Article(url)
    article.download()
    article.html
    article.parse()
    return article.title
from newspaper import Article

a = Article(
    'http://www.cnn.com/2014/01/12/world/asia/north-korea-charles-smith/index.html',
    keep_article_html=True)
a.download()
a.parse()
print(a.article_html)
from newspaper import Article
url = "https://www.marketwatch.com/story/heres-a-better-buy-and-hold-strategy-using-the-dow-jones-industrial-average-2019-02-26"
a = Article(url, language='en')  #English
a.download()
a.parse()
print(a.text)
from newspaper import Article
import random
import string
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import warnings

warnings.filterwarnings('ignore')

nltk.download('punkt', quiet=True)

article = Article(
    'https://www.mayoclinic.org/diseases-conditions/chronic-kidney-disease/symptoms-causes/syc-20354521'
)
article.download()
article.parse()
article.nlp()
corpus = article.text
##print(corpus)

text = corpus
sentence_list = nltk.sent_tokenize(text)
##print(sentence_list)


def index_sort(list_var):
    length = len(list_var)
    list_index = list(range(0, length))
Example #36
# Import the libraries
from newspaper import Article
import nltk
from gtts import gTTS
import os

# Get the article
article = Article('https://www.poetryfoundation.org/poems/46945/baa-baa-black-sheep')

article.download()  # Download the article
article.parse()  # Parse the article
nltk.download('punkt')  # Download the 'punkt' package
article.nlp()  # Apply Natural Language Processing (NLP)

# Get the articles text
mytext = article.text

# Print the text
print(mytext)

# Language in which you want to convert
# language = 'pt-br' #Portuguese (Brazil)
language = 'en'  # English

# Passing the text and language to the engine,
# here we have marked slow=False. Which tells
# the module that the converted audio should
# have a high speed
myobj = gTTS(text=mytext, lang=language, slow=False)

# Saving the converted audio in a mp3 file named
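The snippet above is cut off before the audio is written out. gTTS objects are saved with save(); a one-line completion that continues the script, with a hypothetical output file name:

myobj.save("article_audio.mp3")  # hypothetical file name; the original snippet is truncated before naming one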
Example #37
def triggers(request):
    if request.method == 'POST':
        print(request.POST)
        data = dict(request.POST)
        # Driver Code
        key = 'show_details'
        one = checkKey(data, key)
        key = 'check_triggers'
        two = checkKey(data, key)
        key = 'show_wordcloud'
        three = checkKey(data, key)
        key = 'hate_speech'
        four = checkKey(data, key)
        print(one, two, three)
        #URL Link case
        if (one == True):
            url = data['Link'][0]
            print(url)
            article = Article(url)
            article.download()
            article.parse()
            authors = article.authors
            publishdate = article.publish_date
            #article.text
            article.nlp()
            keywords = article.keywords
            articlesummary = article.summary
            return render(
                request, 'consciousApp/triggers.html', {
                    'authors': authors,
                    'publishdate': publishdate,
                    'keywords': keywords,
                    'articlesummary': articlesummary
                })
        #Show triggers
        elif (two == True):
            text = request.POST['input_text'].lower()
            triggers = [
                "9 11", "9-11", "9/11", "ableism", "abusive", "ageism",
                "alcoholism", "animal abuse", "animal death",
                "animal violence", "bestiality", "gore", "corpse", "bully",
                "cannibal", "car accident", "child abuse", "childbirth",
                "classism", "death", "decapitation", "abuse", "drug", "heroin",
                "cocaine", "eating disorder", "anorexia", "binge eating",
                "bulimia", "fatphobia", "forced captivity", "holocaust",
                "hitler", "homophobia", "hostage", "incest", "kidnap",
                "murder", "nazi", "overdose", "pedophilia", "prostitution",
                "PTSD", "racism", "racist", "rape", "raping", "scarification",
                "self-harm", "self harm", "cutting", "sexism", "slavery",
                "slurs", "suicide", "suicidal", "swearing", "terminal illness",
                "terrorism", "torture", "transphobia", "violence", "warfare"
            ]
            tw = []
            text_file = open(
                './consciousApp/static/consciousApp/input/triggercheckdata.txt',
                'w+')
            text_file.write(str(text))
            text_file.close()
            for trigger in triggers:
                if text.find(trigger) > -1: tw.append(trigger)
            if tw == []: tw.append('No Triggers Found')
            return render(request, 'consciousApp/triggers.html', {
                'text': text,
                'triggers': tw,
                'data': data
            })
        #Show_cloud
        elif (three == True):
            text = request.POST['input_text'].lower()
            tokens = word_tokenize(text)
            textdata = nltk.Text(tokens)
            stopwords = set(STOPWORDS)
            wordcloud = WordCloud(stopwords=stopwords,
                                  max_font_size=50,
                                  max_words=100,
                                  background_color="white").generate(text)
            wordcloud.to_file(
                "./consciousApp/static/consciousApp/output/word-cloud.png")
            data = "./../../static/consciousApp/output/word-cloud.png"
            return render(request, 'consciousApp/triggers.html',
                          {'data': data})

        elif (four == True):
            sonar = Sonar()
            text = request.POST['input_text'].lower()
            url = data['Link'][0]
            data = sonar.ping(text=text)["classes"]
            hate_speech = data[0]
            hate_speech_confidence = hate_speech["confidence"] * 100
            offensive_language = data[1]
            offensive_language_confidence = offensive_language[
                "confidence"] * 100
            neither = data[2]
            neither_confidence = neither["confidence"] * 100
            print(type(data))
            print(offensive_language_confidence * 100,
                  hate_speech_confidence * 100, neither_confidence * 100)
            return render(
                request, 'consciousApp/triggers.html', {
                    'hate_speech_confidence': hate_speech_confidence,
                    'offensive_language_confidence':
                    offensive_language_confidence,
                    'neither_confidence': neither_confidence
                })
    else:

        return render(request, 'consciousApp/triggers.html')
Example #38
from newspaper import Article

# A new article from TOI
url = "http://world.people.com.cn/n1/2019/0308/c1002-30964972.html"

# For different language newspaper refer above table
toi_article = Article(url, language='zh')  # zh for China

# To download the article
toi_article.download()

# To parse the article
toi_article.parse()

# To perform natural language processing, i.e. nlp()
# (needed so that the summary printed below is populated)
toi_article.nlp()

# To extract title
print("Article's Title:")
print(toi_article.title)
print("*" * 80)

# To extract text
print("Article's Text:")
print(toi_article.text)
print("*" * 80)

# To extract summary
print("Article's Summary:")
print(toi_article.summary)
print("*" * 80)
Example #39
 def set_text(self):
     if not self.text and self.url:
         a = Article(self.url)
         a.download()
         a.parse()
         self.text = a.text
Example #40
from newspaper import Article

url = 'https://www.gazzetta.it/Calcio/Serie-A/Juventus/06-08-2019/dybala-altri-tesoretto-la-juve-andra-via-come-mandzukic-matuidi-3401461898595.shtml'
article = Article(url, language='en')
article.download()

article.parse()

print(article.title, "\n\n")
print(article.text)

# article.nlp()

# print(article.keywords)

with open("OUT.txt", "w") as text_file:
    text_file.write(article.text)
Example #41
    def crawling(self, category_name):
        # Multi Process PID
        print(category_name + " PID: " + str(os.getpid()))    

        writer = Writer(category_name=category_name, date=self.date)
        # Article URL format
        if (category_name == "연합뉴스속보"):
            url = "http://news.naver.com/main/list.nhn?mode=LPOD&mid=sec&sid1=001&sid2=140&oid=001&isYeonhapFlash=Y" \
                  + "&date="

        else:
            url = "http://news.naver.com/main/list.nhn?mode=LSD&mid=sec&sid1=" + str(
                self.categories.get(category_name)) + "&date="

        # Collect articles dated from start_year/start_month through end_year/end_month.
        day_urls = self.make_news_page_url(url, self.date['start_year'], self.date['end_year'], self.date['start_month'], self.date['end_month'])
        print(category_name + " Urls are generated")
        print("The crawler starts")

        for URL in day_urls:
            print(URL)
            regex = re.compile(r"date=(\d+)")
            news_date = regex.findall(URL)[0]

            request = self.get_url_data(URL)
            document = BeautifulSoup(request.content, 'html.parser')
            
            # html - newsflash_body - type06_headline, type06
            # Grab the articles listed on each page
            if (category_name == "연합뉴스속보"):
                post_temp = document.select('.newsflash_body .type02 li ')

            else:
                post_temp = document.select('.newsflash_body .type06_headline li dl')
                post_temp.extend(document.select('.newsflash_body .type06 li dl'))
           
            # Store the URLs of the articles on each page
            post = []
            headlines = []
            companys = []

            
            for line in post_temp:
                post.append(line.a.get('href')) # push the URL of every article on this page into the post list
                try:
                    companys.append(line.find('span', class_="writing").text)
                except:
                    companys.append("err")
                try:
                    h = line.find_all('a')
                    if len(h) > 1:
                        headlines.append(h[1].text)
                    elif len(h) == 1:
                        headlines.append(h[0].text)
                    else:
                        headlines.append("err")
                except:
                    headlines.append("err")
            del post_temp
        
            
            print(len(post))

            for i in range(len(post)):  # article URLs
                # crawl delay
                print(i)
                sleep(0.01)
                content_url = post[i]
                
                # fetch the article HTML
                try:
                    article = Article(content_url, language='ko')
                    article.download()
                    article.parse()
                    text_sentence = article.text.strip()
                    text_company = companys[i]
                    text_headline = headlines[i].strip()
        ######################################################################
                    if self.keyword == 'initvalue':
                        wcsv = writer.get_writer_csv()
                        wcsv.writerow([news_date, category_name, text_company, text_headline, text_sentence, content_url])
                    else:
                        headline_to_words = text_headline.split()
                        if self.keyword in headline_to_words:  # .index() would raise ValueError when the keyword is absent
                            wcsv = writer.get_writer_csv()
                            wcsv.writerow([news_date, category_name, text_company, text_headline, text_sentence, content_url])
        ######################################################################

                            
                except Exception as err:
                    print(err)
        
        writer.close()
        return        
Example #42
from gtts import gTTS  # text to speech conversion
import os  # Interacting with operating system
from io import BytesIO
import ssl
import nltk  # needed for nltk.download('punkt') below
from newspaper import Article

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# nltk.download()  # the no-argument call opens the interactive downloader; only 'punkt' (fetched below) is needed

# Get the article
article = Article(
    'https://hackernoon.com/how-to-launch-your-own-blockchain-choosing-the-right-consensus-part-ii-y07y32tv')

article.download()  # Download the article
article.parse()  # Parse the article
nltk.download('punkt')  # Download 'punkt' package
article.nlp()  # Apply nlp (Natural Language Processing)

# Get the article text
my_text = article.text

# Print the text
print(my_text)

# Choose language for tts
language = 'en'  # English
language2 = 'fr'
Example #43
def getData():
    url = request.args.get('url')
    # From Newspaper Framework getting required data
    content = Article(url)
    content.download()
    content.parse()
    title = content.title
    rawText = content.text
    # Unformatted Data to show to user
    textDisplay = rawText.split("\n\n")
    textDisplay = ''.join(textDisplay)
    # Converting numbered text to digits
    t2d = text2digits.Text2Digits()
    numText = t2d.convert(rawText)
    text = numText.split("\n\n")
    text = ''.join(text)
    # Implemented API data limit restriction
    if len(text) < 5000:
        text = text
    else:
        text = text[:5000]
    jsonData = {"text": text}
    configDataResource = os.path.join(SITE_ROOT, "data", "configdata.json")
    configData = json.load(open(configDataResource))

    # NER API call request
    headers = {
        'x-api-key': configData["X_API_KEY"],
        'Content-type': 'application/json'
    }
    ner_response = requests.post(
        configData["NAMED_ENTITY_RECOGNITION_ENDPOINT"],
        headers=headers,
        data=json.dumps(jsonData))
    # print(ner_response.text)
    # Deserializing the response
    places = lambda: None
    places.__dict__ = json.loads(ner_response.text)
    print(places.LOC)

    json_url = os.path.join(SITE_ROOT, "data", "sg-citi.json")
    data = json.load(open(json_url))

    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)
    LOC = []
    CASE = []
    for ent in doc.ents:
        print(ent.text, ent.start_char, ent.end_char, ent.label_)
        if ent.label_ == "CARDINAL":
            CASE.append(ent.text)
        if ent.label_ == "GPE":
            LOC.append(ent.text)

    count = []
    for i in CASE:
        if i.isdigit():
            if i not in count:
                count.append(i)
    print("COUNT: ", count)
    if not len(count):
        count = list(i for i in range(80, 500, 7))
    returnJson = {
        "text": textDisplay,
        "location": [],
        "category": ner_response.text
    }
    for i in places.LOC:
        for citi in data:
            if i in citi["name"] and citi["name"] not in returnJson["location"]:
                returnJson["location"].append({
                    "name":
                    citi["name"],
                    "lat":
                    "no1",
                    "lon":
                    "no2",
                    "count":
                    count[random.randrange(0, len(count))]
                })
                break
    print(returnJson)
    return jsonify(returnJson)
Example #44
def HindustanTimesScrapper():

    SRC = KNOWN_NEWS_SOURCES["Hindustan Times"]

    data1 = get_chronological_headlines(SRC["pages"].format(1))
    data2 = get_trending_headlines(SRC["home"])
    text_lst = []
    url_lst = []
    date_lst = []
    title_lst = []
    try :
        for data in data1:
            if data["content"] == "NA":
                try:
                    article = Article(data["link"])
                    article.download()
                    article.parse()
                    article.nlp()
                    summary = article.text
                    text_lst.append(summary)
                except:
                    text_lst.append(data["content"])
            else:
                text_lst.append(data["content"])
            url_lst.append(data["link"])
            date = data["published_at"]
            if(date == None) :
                date = datetime.now()
            date_lst.append(date)
            try :
                title_lst.append(data["title"])
            except:
                title_lst.append(data["content"].replace("\n\n", " ").replace("\n", " ").split(".")[0])
        for data in data2:
            if data["content"] == "NA":
                try:
                    article = Article(data["link"])
                    article.download()
                    article.parse()
                    article.nlp()
                    summary = article.text
                    text_lst.append(summary)
                except:
                    text_lst.append(data["content"])
            else:
                text_lst.append(data["content"])
            url_lst.append(data["link"])
            date = data["published_at"]
            if(date == None) :
                date = datetime.now()
            date_lst.append(date)
            try :
                title_lst.append(data["title"])
            except:
                title_lst.append(data["content"].replace("\n\n", " ").replace("\n", " ").split(".")[0])

        df_raw = pd.DataFrame(list(zip(text_lst, url_lst, date_lst, title_lst)), columns=["text", "url", "date", "headline"])

        df_crime = get_crime(df_raw)
        data = get_data("./database/data.json")
        df = get_location(df_crime, data)
        df = preprocessing2(df, data)
        return df.reset_index(drop=True)
    except :
        return pd.DataFrame(columns=["index","text","url","crime","location","region","city","date","headline"])
    def parse_content(self, response):
        # This function parses an individual news article

        ID = 'songtengteng'

        website_name = '商务部贸易救济调查局'

        # Site section
        website_block = response.xpath(
            "//div[@class='position']/a[2]/text()").extract_first()

        news_url = response.meta['url']

        # Author
        news_author_list = response.xpath('//script')
        if len(news_author_list) != 0:
            news_author = news_author_list.re(
                'v.{2}\ss.{4}e\s=\s\"[\u4e00-\u9fa5]+\"')[0][13:].replace(
                    '"', '')
        else:
            news_author = '商务部贸易救济调查局'

        # Publication time, normalized to YYYY MM DD HH:Mi:SS           v.{2}\stm\s=\s\".*\"
        publish_time = response.meta['publish_time']
        year = publish_time[0:4]
        month = publish_time[5:7]
        day = publish_time[8:10]
        juti_time = publish_time[-8:]
        publish_time = year + month + day + ' ' + juti_time

        # Tags that come with the article
        news_tags = response.xpath('//script').re(
            'v.{2}\sc.+e\s=\s\"[\u4e00-\u9fa5]+\"')[0][14:].replace('"', '')

        # Article title
        news_title = response.xpath('//h3/text()').extract_first()

        # Article body text
        a = Article(response.url, language='zh')  # Chinese
        a.download()
        a.parse()
        news_content = a.text

        # Get the article's images and their names
        image_urls = []
        image_names = []
        image_urls1 = response.xpath(
            '//p[@class="detailPic"]/img/@src|//div[@class="article_con"]/center/img/@src|//p[@style="text-align: center"]/img/@src'
        ).extract()
        if image_urls1 != []:
            image_urls = image_urls1
            for i in range(len(image_urls)):
                if i < 10 and i >= 0:
                    image_name = news_title + '_000' + str(i)
                    image_names.append(image_name)
                elif i < 100 and i >= 10:
                    image_name = news_title + '_00' + str(i)
                    image_names.append(image_name)
                elif i < 1000 and i >= 100:
                    image_name = news_title + '_0' + str(i)
                    image_names.append(image_name)
                else:
                    image_name = news_title + str(i)
                    image_names.append(image_name)

        yield self.getItem(
            id=ID,
            news_url=news_url,
            website_name=website_name,
            website_block=website_block,
            news_title=news_title,
            publish_time=publish_time,
            news_author=news_author,
            news_tags=news_tags,
            news_content=news_content,
            image_urls=image_urls,
            image_names=image_names,
        )
Example #46
import random
import string
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import warnings
import yaml
import pyaudio
import speech_recognition as sr
from newspaper import Article

warnings.filterwarnings('ignore')

nltk.download('punkt', quiet=True)
### website from where we want to extract the data
article1 = Article('https://en.wikipedia.org/wiki/Coronavirus')
article1.download()
article1.parse()
article1.nlp()

article2 = Article('https://www.euro.who.int/en/health-topics/noncommunicable-diseases/mental-health/data-and-resources/mental-health-and-covid-19')
article2.download()
article2.parse()
article2.nlp()

article3 = Article('https://www.healthline.com/health-news/what-covid-19-is-doing-to-our-mental-health')
article3.download()
article3.parse()
article3.nlp()

article4 = Article('https://www.webmd.com/lung/coronavirus')
Example #47
from newspaper import Article
article = Article(
    'https://www.firstpost.com/health/narendra-modis-speech-on-coronavirus-pm-announces-total-lockdown-for-three-weeks-but-essential-services-to-remain-open-key-takeways-8185551.html',
    language='en')
article.download()
article.parse()
article.nlp()
print(article.summary)
Example #48
import spacy
from newspaper import Article
nlp = spacy.load("en_core_web_sm")

url_1 = 'https://www.wsj.com/articles/u-s-officials-walk-out-of-meeting-at-presidential-palace-in-kabul-11553628051'
url_2 = 'https://www.wsj.com/articles/iran-moves-to-cement-its-influence-in-syria-11553632926'
article_1 = Article(url_1)
article_2 = Article(url_2)
article_1.download()
article_2.download()
article_1.parse()
article_2.parse()

article_stream = [article_1.text, article_2.text]

for doc in nlp.pipe(article_stream, batch_size=50):
    print(doc.vocab)

# for entity in doc.ents:
#     print(entity.text, entity.start_char, entity.end_char, entity.label_)
            row_type = row['Type']
            url = row['URL']
            if row_type != 'None':
                #print(url)
                Aritle_URLs.append(url)


title_create(filename)

Article_Date = []
Article_Title = []
count = 1
for url in Aritle_URLs:

    print("Number : ", count)
    article = Article(url)
    article.download()
    if article.download_state == 0:
        print("Retrying in 25 seconds!!")
        time.sleep(25)
        article.download()
    if article.download_state != 1:
        print("GOOD")
        article.parse()
        dated = str(article.publish_date)
        if dated != 'None':
            the_date = (dated[:10])
            the_date = datetime.datetime.strptime(
                the_date, '%Y-%m-%d').strftime('%m/%d/%Y')  # %m = month; %M would be minutes
            fixed_date = month + the_date[2:10]
            print(fixed_date)
Example #50
def get_article_similarity(url1, url2, log_articles=False):
    try:
        # Download and parse first article
        article1 = Article(url1, browser_user_agent=choice(static.HEADERS))
        article1.download()
        article1.parse()
        article1_text = article1.text

        # Download and parse second article
        article2 = Article(url2, browser_user_agent=choice(static.HEADERS))
        article2.download()
        article2.parse()
        article2_text = article2.text

        if log_articles:
            log.debug(f"Article 1: {article1_text}\n\nArticle 2: {article2_text}")

        # Compare the two articles and return the ratio (0-1)
        return SequenceMatcher(None, article1_text, article2_text).ratio()

    except (ArticleException, Exception):
        log.error(traceback.format_exc())
        log.warning("Couldn't compare articles")
        return None
Example #51
def getArticle(url):
  article = Article(url)
  article.download()
  article.parse()
  return article
Example #52
    a = 0
    print("ANZAHL: " + str(a))
    for feed in feeds:
        d = feedparser.parse(feed[0])
        for entrie in d.entries:
            #check link
            url = entrie.link
            #check date

            #check if in db

            #parse

            #title and
            try:
                article = Article(url, language='de', keep_article_html=True)
                article.download()
                article.parse()

                filename = feed[3] + ''.join(
                    random.choices(string.ascii_uppercase + string.digits,
                                   k=8))

                file = open("html/" + filename + ".html", "w")

                article.article_html = "<meta property='baseurl' content='" + feed[
                    4] + "'>" + article.article_html
                article.article_html = "<script src='https://ajax.googleapis.com/ajax/libs/jquery/3.2.1/jquery.min.js'></script>" + article.article_html
                article.article_html = "<link href='https://fonts.googleapis.com/css?family=Slabo+27px' rel='stylesheet'>" + article.article_html
                article.article_html = "<img src='" + article.top_image + "' width='100%' >" + article.article_html
                article.article_html = "<h1 class='realTitle'>" + entrie.title + "</h1>" + article.article_html
url = "https://www.newindianexpress.com"
page = requests.get(url)

soup = BeautifulSoup(page.text, 'html.parser')

articles = soup.findAll('a', class_="article_click")
news = []
for row in articles:
    news.append(row['href'])
    #link = articles[row].find('a')['href']
    #news.append(link)

dataset = []
for i in news:
    article = Article(i, language="en")
    article.download()
    article.parse()
    article.nlp()
    data = {}
    data['Title'] = article.title
    data['Text'] = article.text
    data['Summary'] = article.summary
    data['Keywords'] = article.keywords
    dataset.append(data)

#print(data)
df = pd.DataFrame(dataset)

# Importing the dataset
uci_dataset = pd.read_csv('OnlineNewsPopularity.csv',
Example #54
def align_row_text():
    source_text = request.form['source_text']
    target_text = request.form['target_text']
    # check if source and target are urls
    url_rex = r"(?i)\b(?:(?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\((?:[^\s()<>]+|(?:\([^\s()<>]+\)))*\))+(?:\((?:[^\s()<>]+|(?:\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    if re.fullmatch(url_rex, source_text.strip().lower()):
        src_article = Article(source_text.strip())
        src_article.download()
        src_article.parse()
        source_text = src_article.title + "\n" + src_article.text
    if re.fullmatch(url_rex, target_text.strip().lower()):
        tar_article = Article(target_text.strip())
        tar_article.download()
        tar_article.parse()
        target_text = tar_article.title + "\n" + tar_article.text

    # segment source and target

    src_lang_code = lang_detect.detect(source_text)
    tar_lang_code = lang_detect.detect(target_text)

    if src_lang_code == 'zh-cn':
        srx_src_code = 'Generic'
    else:
        srx_src_code = src_lang_code

    if tar_lang_code == 'zh-cn':
        srx_tar_code = 'Generic'
    else:
        srx_tar_code = tar_lang_code

    srx_rules = srx_segmenter.parse(srx_file_path)
    seg_results = srx_segmenter.SrxSegmenter(srx_rules[srx_src_code],
                                             source_text)
    source_list = seg_results.extract()[0]
    seg_results = srx_segmenter.SrxSegmenter(srx_rules[srx_tar_code],
                                             target_text)
    target_list = seg_results.extract()[0]
    # translate target
    target_mt_list = mt_helpers.google_translate_chunk_by_chunk(
        target_list, tar_lang_code, src_lang_code)
    # align
    # initiate the alignment class
    algorithm = request.form.get('algorithm', 'fuzzy')
    align_options = {
        "location_weight":
        float(request.form.get('input_location_weight', 0.2)),
        "length_weight":
        float(request.form.get('input_length_weight', 0.1)),
        "meta_weight":
        float(request.form.get('input_length_weight', 0.1)),
        "semantic_weight":
        float(request.form.get('input_semantic_weight', 0.6)),
        "search_range":
        float(request.form.get('input_paragraph_size', 5)),
        "minimum_semantic_score":
        float(request.form.get('input_minimum_semantic_score', 0.5)),
        "minimum_partial_sem_match":
        0.1,
        "minimum_length_score":
        float(request.form.get('input_minimum_length_score', 0.6))
    }

    if algorithm == 'fuzzy':
        semantic_class = fuzzy_comp.FuzzyComp
    else:
        semantic_class = tfidf_scikit.TfidfComp
    alg = TranslationAligner()

    alg.align(semantic_class,
              source_list,
              target_list, [],
              target_mt_list,
              options=align_options)
    # save json file to a random file name under static files and return it with the results
    temp_file_name = ''.join(
        random.choices(string.ascii_uppercase + string.digits, k=10))
    temp_json_file_name = temp_file_name + ".json"
    alg.export_json_dict(os.path.join(export_path, temp_json_file_name))
    del alg
    return {"json_file_name": temp_json_file_name}
        mycursor = mydb.cursor()
        sql = "SELECT * FROM Wordpress order by dominio asc"
        mycursor.execute(sql)
        sql = mycursor.fetchall()
        for portal in sql:
            try:
                if portal[7] is not None:
                    d = fp.parse(portal[7])
                    for entry in d.entries:
                        # Check if publish date is provided, if no the article is skipped.
                        # This is done to keep consistency in the data and to keep the script from crashing.
                        if hasattr(entry, 'published'):

                            try:
                                content = Article(entry.link)
                                content.download()
                                content.parse()
                            except Exception as e:
                                # If the download for some reason fails (ex. 404) the script will continue downloading
                                # the next article.
                                print(e)
                                print("continuing...")
                                continue
                            try:
                                twitter = content.meta_data["twitter"]
                            except Exception as e:
                                print("twitter")
                            try:
                                og = content.meta_data["og"]
                            except Exception as e:
Example #56
    def parse_artical(self, response):  # parse an individual article
        ID = 'songtengteng'

        # Article link
        news_url = response.meta['url']

        # Article title
        news_title = response.xpath('//h1/text()').extract_first()

        # Author
        a = response.xpath(
            '//div[@class="info-source"]/span/a/text()').extract_first()
        if a == None:
            news_author = ''
        else:
            news_author = a

        # Publication time
        publish_time = response.xpath(
            '//div[@class="info-source"]/span[2]/text()').extract_first()
        year = publish_time[0:4]
        month = publish_time[5:7]
        day = publish_time[8:10]
        juti_time = publish_time[-5:]
        publish_time = year + month + day + ' ' + juti_time + ':' + '00'

        # Body text
        '''A text-density algorithm could be considered here to extract the article body more quickly'''
        a = Article(response.meta['url'], language='zh')  # Chinese
        a.download()
        a.parse()
        news_content = a.text

        # Tags
        news_tags = ''

        # Images
        image_urls1 = response.xpath('//p[@class="pi"]/img/@src').extract()
        image_urls = []
        image_names = []
        if image_urls1 != []:
            for i in range(len(image_urls1)):
                image_url = image_urls1[i]
                image_urls.append(image_url)
                if i >= 0 and i < 10:
                    image_title = news_title + '000' + str(i)
                elif i >= 10 and i < 100:
                    image_title = news_title + '00' + str(i)
                elif i >= 100 and i < 1000:
                    image_title = news_title + '0' + str(i)
                else:
                    image_title = news_title + str(i)
                image_names.append(image_title)

        yield self.getItem(id=ID,
                           news_url=news_url,
                           website_name='搜狐焦点',
                           website_block='访谈',
                           news_title=news_title,
                           publish_time=publish_time,
                           news_author=news_author,
                           news_tags=news_tags,
                           news_content=news_content,
                           image_urls=image_urls,
                           image_names=image_names)
Example #57
def get_article(url):
    article = Article(url, language='en')
    article.download()
    article.parse()
    return article
Example #58
        links.append(i.get('href'))
data = {}
count = 0
for i in links:
    urls = 'https://english.mathrubhumi.com/' + i
    page = requests.get(urls).text
    soup = BeautifulSoup(page)
    headline = soup.find("div", {"class": "common_text_en date_outer"})
    if headline:
        date = headline.get_text().strip()
        date_time = datetime.strptime(date[:-4], '%b %d, %Y, %I:%M %p')
        parag = soup.find("div", {"class": "articleBody common_text"})
        para = parag.find_all('p')
        place = para[0].get_text().split(':')[0]
        if date_time > recent:
            article = Article(urls, 'en')
            article.download()
            article.parse()
            article.nlp()
            summary = article.summary
            data[count] = [date_time, place, summary, article.keywords]
            count += 1

links = []
url = 'https://www.onmanorama.com/districts/'
districts = [
    'alappuzha', 'ernakulam', 'idukki', 'kannur', 'kasaragod', 'kollam',
    'kottayam', 'kozhikode', 'malappuram', 'palakkad', 'pathanamthitta',
    'thiruvananthapuram', 'thrissur', 'wayanad'
]
for d in districts:
Example #59
def scrape_analyze(url):
    article = Article(url)
    article.download()
    article.parse()
    return article.text
Example #60
     newsPaper = {
         "rss": value['rss'],
         "link": value['link'],
         "articles": []
     }
     for entry in d.entries:
         if hasattr(entry, 'published'):
             if count > LIMIT:
                 break
             article = {}
             article['link'] = entry.link
             date = entry.published_parsed
             article['published'] = datetime.fromtimestamp(
                 mktime(date)).isoformat()
             try:
                 content = Article(entry.link)
                 content.download()
                 content.parse()
             except Exception as e:
                 print(e)
                 print("continuing...")
                 continue
             article['title'] = content.title
             article['text'] = content.text
             newsPaper['articles'].append(article)
             print(count, "articles downloaded from", company, ", url: ",
                   entry.link)
             count = count + 1
 else:
     # This is the fallback method if a RSS-feed link is not provided.
     # It uses the python newspaper library to extract articles