Example #1
	def extract(self, article_url):

		article = Article(url=article_url)
		article.download()
		article.parse()

		return article.text
Example #2
def get_article():
	tree_urls = ET.parse("DB_urls.xml")
	root_urls = tree_urls.getroot()

	# The problem with English and Chinese can be solved with 
	for field_urls in root_urls.findall("row"):
		url_urls = field_urls.find("field").text
	#	url_urls = 'http://news.sina.com.cn/c/2014-04-21/204729980947.shtml'
	#	url_urls = 'http://china.caixin.com/2013-12-30/100623243.html'

		try:
			response = urllib2.urlopen(url_urls)
			status = response.code

			#print "detected webpage code:", status

			if(status == 404):
				continue
			else:
				a_zh = Article(url_urls, language = 'zh')
				a_zh.download()
				a_zh.parse()
				content_urls = a_zh.text

				if(content_urls == ''):
					a_en = Article(url_urls, language = 'en')
					a_en.download()
					a_en.parse()
					content_urls = content_urls + a_en.text

				if(content_urls != ''):
					compare_article(url_urls, content_urls)			
		except:
			pass
Example #3
 def test_arabic_fulltext_extract(self):
     url = "http://arabic.cnn.com/2013/middle_east/8/3/syria.clashes/index.html"
     article = Article(url=url, language="ar")
     article.download()
     article.parse()
     with codecs.open(os.path.join(TEXT_FN, "arabic_text_1.txt"), "r", "utf8") as f:
         assert article.text == f.read()
Example #4
def parse_article(url, lang, featured=0, db=connect_db()):
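    # Return the cached row if this URL has already been parsed and stored.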
    cur = db.execute("select * from articles where url=?", (url,))
    entries = [dict(id=row[0], url=row[1], title=row[2], image=row[3], text=row[4], authors=row[5], date=row[6], featured=row[7], language=row[8]) for row in cur.fetchall()]

    if len(entries) >= 1:
        return entries[0]

    article = Article(url)
    article.download()

    try:
        article.parse()
    except:
        return None

    title = article.title
    image = article.top_image
    text = article.text
    authors = ",".join(article.authors)
    date = int(time.mktime(article.publish_date.timetuple())) if type(article.publish_date) is datetime.datetime else 0

    db.execute("insert into articles (url, title, image, text, authors, date, featured, language) values (?, ?, ?, ?, ?, ?, ?, ?)", (url, title, image, text, authors, date, featured and len(text) >= 50, lang))
    db.commit()

    idquery = db.execute("select (id) from articles where url=?", (url,))
    id = [row[0] for row in idquery.fetchall()][0]

    return {"id": id, "url": url, "title": title, "image": image, "text": text, "authors": authors, "date": date, "language": lang}
Example #5
 def test_spanish_fulltext_extract(self):
     url = "http://ultimahora.es/mallorca/noticia/noticias/local/fiscalia-anticorrupcion-estudia-recurre-imputacion-infanta.html"
     article = Article(url=url, language="es")
     article.download()
     article.parse()
     with codecs.open(os.path.join(TEXT_FN, "spanish_text_1.txt"), "r", "utf8") as f:
         assert article.text == f.read()
Example #6
def get_nlp_data(url):
	article = Article(url)
	article.download()
	article.parse()
	article.nlp()
	
	return json.dumps(article.keywords)
Example #7
 def test_chinese_fulltext_extract(self):
     url = "http://www.bbc.co.uk/zhongwen/simp/chinese_news/2012/12/121210_hongkong_politics.shtml"
     article = Article(url=url, language="zh")
     article.download()
     article.parse()
     with codecs.open(os.path.join(TEXT_FN, "chinese_text_1.txt"), "r", "utf8") as f:
         assert article.text == f.read()
Example #8
def main():
    try:
        headlines = requests.get(headline_url)
        
        headlines = json.loads(headlines.text)
        for headline in headlines['Headlines']:
            print("Processing Article %s" % headline['Url'])
            article = Article(headline['Url'])
            article.download()
            article.parse()
            
            
            response = requests.post(calais_url, files={'file': article.text}, headers=headers, timeout=80)
            rdf = json.loads(response.text)
            
            for x in rdf:
                if '_type' in rdf[x] and 'name' in rdf[x]:
                    print("Output for %s %s" % (rdf[x]['_type'], rdf[x]['name']))
                    for instance in rdf[x]['instances']:
                        text = instance['prefix'] + instance['suffix']
                        blob = TextBlob(text)
                        for sentence in blob.sentences:
                            print(sentence)
                            print(sentence.sentiment.polarity)
            print('--------------------')
            
            #print(rdf)
    except Exception as e:
        print('Error connecting:', e)
Example #9
    def run(self):
        logging.debug("run() - [WAIT]")
        from newspaper import Article

        '''
        Library documentation: http://newspaper.readthedocs.org/en/latest/user_guide/quickstart.htm
        '''

        NOTES_LIST = [
            '118',
            '117',
            # '116',
            # '115',
        ]
        for note_id in NOTES_LIST:
            note = Article(url="http://site.tiagoprnl.in/core/visitor_home/nota/%s/" % note_id)
            note.download()

            print '*' * 100
            # print 'H T M L'
            # print note.html
            #print '*' * 100
            # print 'T E X T'
            note.parse()
            print note.text


        logging.debug("run() - [DONE]")
Example #10
def main(argv):
    if len(argv) > 1:
        htmlist = argv[1]
    else:
        htmlist = 'htmlist'

    # Our permanent config for html cleaning
    config = Config()
    config.language = 'id'
    config.MIN_SENT_COUNT = 20
    config.memoize = False
    config.fetch_images = False
    config.verbose= True

    cleaner = Article(url='', config=config)

    with open(htmlist, 'r') as f:
        htmfile = f.read().split('\n')

    raw = []

    for htm in htmfile:
        print (htm)
        if not htm.endswith("rss.html"):
            with open(htm, 'r') as f:
                h = f.read()

            cleaner.set_html(h)
            cleaner.parse()
            sentences = nlp.split_sentences(cleaner.text)
            #raw.append(sentences])
        
            with open('htm-out', 'a') as f:
                [f.write(r + '\n') for r in sentences]
Example #11
def post_new(request):
    if request.method == "POST":
        form = PostForm(request.POST)
        if form.is_valid():
            post = form.save(commit=False)
            post.author = request.user
            post.published_date = timezone.now()
            post.save()
            return redirect('blog.views.post_detail', pk=post.pk)
    elif request.method == 'GET':
        url = request.GET.get('url', '')
               
        if len(url) > 5:
            article = Article(url, language='en')
            article.download()
            article.parse()
            article.nlp()
            image = article.top_image
            summary = article.summary.replace('\n', ' ').replace(u'\u2019',"\'")
            title = article.title.replace(u'\u2019',"\'")
            source = url.split('//')[1].split('/')[0].replace('www.','')
            status = 'UD'
            form = PostForm({'title': title, 'summary': summary, 'image': image, 'link':url, 'source':source, 'status':status,}) 
        else:
            form = PostForm() 

    return render(request, 'blog/post_edit.html', {'form': form})
Example #12
def scrapeURLS(inFilPath):
    texts = []
    cache = loadCache()
    toDelURLs = []
    with open(inFilPath) as f:
        urls = f.readlines()
    for url in urls:
        if filter(urlFilters, url):
            toDelURLs.append(url)
            
        if url in cache:
            txt = cache[url]
        else:
            print "Scraping URL %s" % url
            article = Article(url)
            article.download()
            article.parse()
            txt = article.text.replace("\n", " ").replace("  ", " ").strip()
            if txt == "" or filter(txtFilter, txt):
                toDelURLs.append(url)
                continue
            cacheURL(url, txt)
        texts.append(txt)
        deleteURLs(inFilPath, toDelURLs)
    return texts
Example #13
 def check_url(args):
     """
     :param (basestr, basestr) url, res_filename:
     :return: (pubdate_failed, fulltext_failed)
     """
     url, res_filename = args
     pubdate_failed, fulltext_failed = False, False
     html = mock_resource_with(res_filename, 'html')
     try:
         a = Article(url)
         a.download(html)
         a.parse()
         if a.publish_date is None:
             pubdate_failed = True
     except Exception:
         print('<< URL: %s parse ERROR >>' % url)
         traceback.print_exc()
         pubdate_failed, fulltext_failed = True, True
     else:
         correct_text = mock_resource_with(res_filename, 'txt')
         if not (a.text == correct_text):
             # print('Diff: ', simplediff.diff(correct_text, a.text))
             # `correct_text` holds the reason of failure if failure
             print('%s -- %s -- %s' %
                   ('Fulltext failed',
                    res_filename, correct_text.strip()))
             fulltext_failed = True
             # TODO: assert statements are commented out for full-text
             # extraction tests because we are constantly tweaking the
             # algorithm and improving
             # assert a.text == correct_text
     return pubdate_failed, fulltext_failed
Example #14
    def wrap_newspaper(self, web_page):
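        # Reuse HTML that was already fetched: assign it and mark the article as downloaded so parse() skips the download step.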
        parser = NewspaperArticle(url=web_page.final_url)
        parser.html = web_page.html
        parser.is_downloaded = True
        parser.parse()

        return parser
Example #15
def f(url):
	url_urls = url.text
	try:
		response = urllib2.urlopen(url_urls)
		status = response.code

		#print "detected webpage code:", status

		if(status == 404):
			pass
		else:
			a_zh = Article(url_urls, language = 'zh')
			a_zh.download()
			a_zh.parse()
			# content_urls = a_zh.text

			# if(content_urls == ''):
			# 	a_en = Article(url_urls, language = 'en')
			# 	a_en.download()
			# 	a_en.parse()
			# 	content_urls = content_urls + a_en.text

			# if(content_urls != ''):
			# 	pass
			# 	# compare_article(url_urls, content_urls)			
	except:
		pass
Example #16
def extract():
  url = sys.argv[1:].pop()

  a = Article(url, keep_article_html=True)
  a.download()
  a.parse()
  a.nlp()

  parsed_uri = urlparse(a.source_url)
  domain = '{uri.netloc}'.format(uri=parsed_uri)

  try:
    publish_date = a.publish_date.strftime('%Y-%m-%d %H:%M')
  except AttributeError:
    publish_date = ""

  try:
    authors = ", ".join(a.authors)
  except AttributeError:
    authors = ""

  result = {}
  result['html'] = a.html
  result['body'] = a.text
  result['title'] = a.title
  result['top_image'] = a.top_image
  result['author'] = authors
  result['html_body'] = a.article_html
  result['favicon'] = a.meta_favicon
  result['description'] = a.summary
  result['publish_date'] = publish_date
  result['keywords'] = a.keywords
  result['sitename'] = re.sub(r"^www\.", "", domain)

  return json.dumps(result).encode('utf-8')
Example #17
def show_article():
    url_to_clean = request.args.get('url_to_clean')
    if not url_to_clean:
        return redirect(url_for('index'))

    article = Article(url_to_clean)
    article.download()
    article.parse()

    try:
      html_string = ElementTree.tostring(article.clean_top_node)
    except:
      html_string = "Error converting html to string."

    try:
      article.nlp()
    except:
      log.error("Couldn't process with NLP")

    a = {
          'html': html_string, 
         'authors': str(', '.join(article.authors)), 
         'title': article.title,
         'text': article.text,
         'top_image': article.top_image,
         'videos': str(', '.join(article.movies)),
         'keywords': str(', '.join(article.keywords)),
         'summary': article.summary
         }
    return render_template('article/index.html', article=a, url=url_to_clean)
    
Example #18
def parse_input(text, extractor='newspaper'):
    if isinstance(text, str) or isinstance(text, unicode):
        if text.startswith(('http://', 'https://')):
            # Input is a link - need to extract the text from html
            if extractor.lower() == 'goose':
                from goose import Goose
                urlparse = Goose()
                article = urlparse.extract(url=text)
                return unicode_to_ascii(article.cleaned_text)
            else:
                from newspaper import Article
                article = Article(text)
                article.download()
                article.parse()
                return unicode_to_ascii(article.text)
        elif text.endswith('.txt'):
            # Input is a file - need to read it
            textfile = open(text, 'rb')
            article = textfile.read()
            textfile.close()
            return unicode_to_ascii(article)
        else:
            # Input is a string containing the raw text
            return unicode_to_ascii(text)
    else:
        raise ValueError('Input text must be of type str or unicode.')
Example #19
def get_news():
    urls = get_urls()
    news = News.query.with_entities(News.source_url).all()

    used_urls = []
    for n in news:
        used_urls.append(n[0])

    for url in urls:
        if not url in used_urls:
            used_urls.append(url)

            article = Article(url, language='pt', keep_article_html=True)
            article.download()
            article.parse()
            article.nlp()

            news_article = News(url)
            news_article.slug = slugify(article.title)
            news_article.title = article.title
            news_article.text = article.text
            news_article.top_image = article.top_image
            news_article.summary = article.summary
            news_article.article_html = article.article_html
            news_article.created_at = datetime.datetime.now()

            exists_this_news = News.query.filter_by(source_url=url).first()

            if not exists_this_news:
                print(url)
                db.session.add(news_article)
                db.session.commit()
Example #20
    def parse_news(self, response):
        item = ScrapyGooglenewsItem()
        #only log the warning info from request
        logging.getLogger("requests").setLevel(logging.WARNING)

        for href in response.xpath('//h2[@class="title"]/a/@href').extract():
            item['link'] = href
            #use newspaper-0.0.8 to scrape the webpage, then get clean text.
            article = Article(item['link'])
            article.download()
            article.parse()
            item['title'] = article.title
            item['text'] = article.text
            #item['authors'] = article.authors
            #item['date'] = article.publish_date

            topic = response.url.split('&')[-1]
            if topic == 'topic=w':
                item['domain'] = 'World'
            elif topic == 'topic=n':
                item['domain'] = 'U.S.'
            elif topic == 'topic=b':
                item['domain'] = 'Business'
            elif topic == 'topic=tc':
                item['domain'] = 'Technology'
            elif topic == 'topic=e':
                item['domain'] = 'Entertainment'
            elif topic == 'topic=s':
                item['domain'] = 'Sports'
            elif topic == 'topic=snc':
                item['domain'] = 'Science'
            elif topic == 'topic=m':
                item['domain'] = 'Health'

            yield item
Example #21
 def get_article_by_url(url):
     article = Article(url, fetch_images=False)
     article.download()
     if url == "empty":
         return "nolist"
     article.parse()
     return article.text
Example #22
    def extract(self, item):
        """Creates an instance of Article without a Download and returns an ArticleCandidate with the results of
        parsing the HTML-Code.

        :param item: A NewscrawlerItem to parse.
        :return: ArticleCandidate containing the recovered article data.
        """
        article_candidate = ArticleCandidate()
        article_candidate.extractor = self._name()

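        # Parse the spider's pre-fetched HTML by feeding it into an empty Article instead of downloading again.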
        article = Article('')
        article.set_html(item['spider_response'].body)
        article.parse()
        article_candidate.title = article.title
        article_candidate.description = article.meta_description
        article_candidate.text = article.text
        article_candidate.topimage = article.top_image
        article_candidate.author = article.authors
        if article.publish_date is not None:
            try:
                article_candidate.publish_date = article.publish_date.strftime('%Y-%m-%d %H:%M:%S')
            except ValueError as exception:
                self.log.debug('%s: Newspaper failed to extract the date in the supported format,'
                              'Publishing date set to None' % item['url'])
        article_candidate.language = article.meta_lang

        return article_candidate
Example #23
def insert_url(url):
    conn = sqlite3.connect('publico_news_sqllite3.db')
    cursor = conn.cursor()

    # get the article in plain text
    article = Article(url)
    article.download()
    article.parse()
    date = article.publish_date
    title = article.title
    text = article.text

    item = dict()
    item['datetime'] = date
    item['title'] = title
    item['text'] = text
    item['category'] = sys.argv[1].split('/')[6]
    item['link'] = sys.argv[1]
    item['origLink'] = sys.argv[1]

    print(item['category'])
    print(item['datetime'])

    if not duplicate(item, item['category'], cursor):
        status = insert_db(item, item['category'], cursor)
        if status == 1:
            print(sys.argv[1], "inserted")
        else:
            print("Error", status)
    else:
        print(url, "already in BD")

    conn.commit()
    conn.close()
Example #24
 def test2(self):
     articles =[
      'http://www.radionz.co.nz/news/national/281869/seven-arrests-over-police-manhunt',
      'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11491573',
      'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11580358',
      'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11580350',
      'http://www.stuff.co.nz/national/crime/75978990/whanganui-woman-accused-of-leaving-child-in-car-overnight.html',
      'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11574608',
      'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11577923',
      'http://www.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11591401',
      'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11566180'
      ]
     
     articles = [
      'http://www.express.co.uk/news/uk/657926/New-Zealand-John-Key-slams-David-Cameron-Britain-forgetting-history-European-Union-EU',
      'http://www.bbc.co.uk/news/uk-wales-35954982',
      'http://www.telegraph.co.uk/news/2016/04/04/david-cameron-will-be-an-excellent-former-prime-minister/',
      'http://www.pressandjournal.co.uk/fp/news/aberdeenshire/880519/david-camerons-father-named-panamanian-company-aberdeenshire-home/',
      'http://www.theguardian.com/politics/2016/apr/01/senior-tories-brexit-vote-leave-attacks-david-cameron-letter-nhs-staff',
      'http://www.dailymail.co.uk/news/article-3519908/Nuclear-drones-threat-British-cities-Cameron-Obama-hold-war-game-session-respond-attack-kill-thousands-people.html',
      'http://www.telegraph.co.uk/news/2016/03/31/if-david-cameron-cant-stop-the-tory-fighting-hell-clear-jeremy-c/',
      'http://www.manchestereveningnews.co.uk/news/greater-manchester-news/gmp-boost-number-armed-officers-11125178',
      'http://www.theguardian.com/commentisfree/2016/apr/03/cameron-headphones-what-is-cool-what-is-not']
     
     with open("./Output2.txt", "w") as text_file:
         for url in articles:
             print(url)
             a = Article(url)
             a.download()
             a.parse()
             text_file.write(a.text.encode('utf-8'))
             text_file.write('\n')
Example #25
def is_valid_article(link):
    print("Checking valid:\n" + link)

    if "cnn.com" not in link:
        return False
    if "html" not in link:
        return False
    article = Article(link)
    article.download()
    article.parse()
    article.nlp()
    keywords = article.keywords

    matched = False

    for key in keywords:
        if key in nc_set:
            matched = True
    for key in keywords:
        if key in contorversial_set:
            matched = False

    if matched & (len(article.authors) > 0) & (article.publish_date < datetime.datetime(2007, 12, 30, 0, 0)):
        main_file.write(article.title+"\t\t"+",".join(article.keywords)+"\t\t"+link+"\t\t"+article.text+"\n")
        visited_articles.write(link+"\n")
        return True

    return False
Example #26
def get_image():
  url = request.args.get('url', '')
  if not url:
    abort(400)

  if is_image(url):
    return redirect(url)

  article = Article(url)
  article.download()

  try:
    article.parse()
  except (IOError, UnicodeDecodeError):
    return '', 422

  try:
    top_image = article.top_image.rsplit('?',1)[0]
  except AttributeError:
    top_image = ''

  if not top_image == '':
    return redirect(top_image)
  else:
    return '', 422
Example #27
    def runTest(self):
        # The "correct" fulltext needs to be manually checked
        # we have 50 so far
        FULLTEXT_PREPARED = 50
        domain_counters = {}

        with open(URLS_FILE, 'r') as f:
            urls = [d.strip() for d in f.readlines() if d.strip()]

        for url in urls[:FULLTEXT_PREPARED]:
            domain = get_base_domain(url)
            if domain in domain_counters:
                domain_counters[domain] += 1
            else:
                domain_counters[domain] = 1

            res_filename = domain + str(domain_counters[domain])
            html = mock_resource_with(res_filename, 'html')
            try:
                a = Article(url)
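                # pass the local HTML fixture to download() so the test never hits the network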
                a.download(html)
                a.parse()
            except Exception:
                print('<< URL: %s parse ERROR >>' % url)
                traceback.print_exc()
                continue

            correct_text = mock_resource_with(res_filename, 'txt')
            condensed_url = url[:30] + ' ...'
            print('%s -- fulltext status: %s' %
                  (condensed_url, a.text == correct_text))
Example #28
def makeDocs():
    utc = pytz.utc
    es = Elasticsearch(BONSAI_URL, verify_certs= True)
    es.indices.delete(index='news', ignore=[400, 404])
    es.indices.create(index='news', ignore=400)

    print "Created"
    cnn_paper = newspaper.build(u'http://cnn.com', memoize_articles=False)
    a = defaultdict(int)
    cnn_articles = cnn_paper.articles
    print cnn_paper.size()
    for i in range(10):
        article = cnn_articles[i]
        url = article.url
        art = Article(url)
        art.download()
        art.parse()
        print art.publish_date
        print art.text
        print "Article" + str(i)
        print art.publish_date is not None
        print art.text is not None
        if (art.publish_date is not None) and (art.text is not None):
            try:
                doc = {
                'domain': 'CNN',
                'date': utc.localize(art.publish_date), 
                'text': art.text
                }
                res = es.index(index="news", doc_type='article', id=i, body=doc)
                print "Doc" + str(i)
            except:
                print "Doc not accepted"
Example #29
def get_details():
    url = request.args.get('url', '')
    if not url:
      abort(400)

    if is_image(url):
      result = {
        "url": url,
        "top_image": url,
        "text": "",
      }
      return jsonify(result)

    article = Article(url)
    article.download()

    try:
      article.parse()
    except (IOError, UnicodeDecodeError):
      return '', 422

    try:
      top_image = article.top_image.rsplit('?',1)[0]
    except AttributeError:
      top_image = ''

    result = {
      "url": url,
      "top_image": top_image,
      "text": article.text,
    }

    return jsonify(result)
Example #30
def get_article(url):
    a = Article(url)
    a.download()
    a.parse()

    article = dict()

    article['title'] = a.title
    article['publish_date'] = a.publish_date
    article['authors'] = a.authors
    article['lead_image'] = a.top_image
    article['movies'] = a.movies
    article['text'] = a.text
    article['keywords'] = get_keywords(a.text)


    # This is more likely to fail.
    # try:
    #     article.nlp()
    #     article['summary'] = 'This summary is generated: \n ' + a.summary
    # except Exception:
    #     print Exception
    #     article['summary'] = a.summary

    return article
Example #31
 def analyse_web_page_article(self, url):
     article = Article(url)
     article.download()
     article.parse()
     return article, self.analyse(article.text)
Example #32
 def livearticles():
     bs_live = feedparser.parse("https://economictimes.indiatimes.com/rssfeedstopstories.cms")
     bs_compare = bs_live.entries[0].published
     hindu_live = feedparser.parse("https://indianexpress.com/section/world/feed/")
     hindu_compare = hindu_live.entries[0].published
     # bl_live = feedparser.parse("https://www.thehindubusinessline.com/feeder/default.rss")
     # bl_compare = bl_live.entries[0].published
     var = 1
     with open('NewsArticles.csv', 'a+', newline='', encoding="UTF-8") as file:
         writer = csv.writer(file)
         while var == 1:
             print("-----------------------------------------------------------------------------------------------------------")
             bs_live = feedparser.parse("https://economictimes.indiatimes.com/rssfeedstopstories.cms")
             for entry in bs_live.entries:
                 if entry.published > bs_compare:
                     url = entry.link
                     article = Article(url)
                     article.download()
                     article.parse()
                     article.nlp()
                     title = article.title
                     summary = article.summary
                     keywords = ', '.join(article.keywords)
                     date = entry.published
                     writer.writerow([title, date, keywords, summary, url])
                     print(entry.title)
                     print(entry.published)
                     print("the business-standard")
                     t = time.localtime()
                     current_time = time.strftime("%H:%M:%S", t)
                     print(current_time)
                 else:
                     bs_compare = bs_live.entries[0].published
                     break
             print("-----------------------------------------------------------------------------------------------------------")
             hindu_live = feedparser.parse("https://indianexpress.com/section/world/feed/")
             for entry in hindu_live.entries:
                 if entry.published > hindu_compare:
                     url = entry.link
                     article = Article(url)
                     article.download()
                     article.parse()
                     article.nlp()
                     title = article.title
                     summary = article.summary
                     keywords = ', '.join(article.keywords)
                     date = entry.published
                     writer.writerow([title, date, keywords, summary, url])
                     print(entry.title)
                     print(entry.published)
                     print("the hindu")
                     t = time.localtime()
                     current_time = time.strftime("%H:%M:%S", t)
                     print(current_time)
                 else:
                     hindu_compare = hindu_live.entries[0].published
                     break
             print("-----------------------------------------------------------------------------------------------------------")
             # bl_live = feedparser.parse("https://www.thehindubusinessline.com/feeder/default.rss")
             # for entry in bl_live.entries:
             #     if entry.published > bl_compare:
             #         url = entry.link
             #         article = Article(url)
             #         article.download()
             #         article.parse()
             #         article.nlp()
             #         title = article.title
             #         summary = article.summary
             #         keywords = ', '.join(article.keywords)
             #         date = entry.published
             #         writer.writerow([title, date, keywords, summary, url])
             #         print(entry.title)
             #         print(entry.published)
             #         print("the business line")
             #         t = time.localtime()
             #         current_time = time.strftime("%H:%M:%S", t)
             #         print(current_time)
             #     else:
             #         bl_compare = bl_live.entries[0].published
             #         break
             time.sleep(5)
Example #33
'''
pip3 install newspaper3k
'''

from newspaper import Article

import sys

try:
    url = sys.argv[1]
except IndexError:
    print('[X] please enter url')
    sys.exit(0)

print('[*] url: %s' % url)

print('[F] ----- download & parse -----')
article = Article(url, language='zh')
article.download()
article.parse()

print('[*] authors: %s' % article.authors)
print('[*] publish_date: %s' % article.publish_date)
print('[*] text: %s' % article.text)
print('[*] top_image: %s' % article.top_image)
print('[*] movies: %s' % article.movies)

article.nlp()
print('[F] ----- article.nlp() -----')
print('[*] keywords: %s' % article.keywords)
print('[*] summary: %s' % article.summary)
Example #34
def fetch_words_from_news(url, translate_TF=False):
    # return [[],[]] 0: en, 1:tr
    article = Article(url)
    article.download()
    article.parse()

    for_nltk = []
    news_text = article.text
    for_nltk.append(article.text)
    news_text = news_text.upper()
    news_text_wo_rn = news_text.replace('\n', ' ')
    news_text_wo_rn = news_text_wo_rn.replace('\r', ' ')
    news_text_list = news_text_wo_rn.split(' ')
    news_text_list = set(news_text_list)
    tokenized_sents = [word_tokenize(i) for i in for_nltk]

    # remove punctuations from list

    res = []
    new_res = []

    #s.translate(None, string.punctuation)

    #res = [s.translate(str.maketrans('', '', string.punctuation)) for s in tokenized_sents[0]

    for tixt in tokenized_sents[0]:
        new_tixt = ''.join(
            c.translate(str.maketrans('', '', string.punctuation + '“”'))
            for c in tixt if c not in string.punctuation + '“”')
        res.append(new_tixt)

    for d in res:
        if not d == '':
            new_res.append(d)

    capitalized_new_res = [KAP.upper() for KAP in new_res]

    capitalized_setted_new_res = set(capitalized_new_res)

    # delete one len item

    more_than_one_len_CSNR = []

    for e in capitalized_setted_new_res:
        if not len(e) < 2:
            more_than_one_len_CSNR.append(e)

    # delete numbers

    digitless_more_than_OLC = []

    for g in more_than_one_len_CSNR:
        if g.isalpha():
            digitless_more_than_OLC.append(g)

    tags_of_diggless = [nltk.pos_tag(f) for f in digitless_more_than_OLC]
    tags_of_diggless_2 = nltk.pos_tag(digitless_more_than_OLC)

    prepless_digitless_MTO = []

    for h in digitless_more_than_OLC:
        if not h.lower() in stop_words:
            prepless_digitless_MTO.append(h)

    if_word_in_cor_PDMTO = []
    TR_if_word_in_cor_PDMTO = []

    for g in prepless_digitless_MTO:
        if g.lower() in words.words():
            if_word_in_cor_PDMTO.append(g)
            transed = ""
            if not translate_TF == False:
                transed = translate.translate(g, 'en-tr')  # tr.set_text(g)
                transed = transed['text'][0]
            TR_if_word_in_cor_PDMTO.append(transed)  #  tr.translate()
            #yazi = transed['text'][0]+"\\r\\n"
            #with open('log.txt', 'a') as file:
            #    file.write(yazi)

    return [if_word_in_cor_PDMTO,
            TR_if_word_in_cor_PDMTO]  # return [[],[]] 0: en, 1:tr
Example #35
def newspaperize(article_url):
    """Takes a string url that contains an article. Returns a Story object from 
    models.py containing information scraped from the article located at the url."""

    article = Article(article_url)  # create Article object

    print("Downloading:", article_url)

    try:  # returns None if url fails to download
        article.download()
    except:
        print("Failed to download url:", article_url)
        return None

    try:  # returns None if url cannot be parsed
        article.parse()
    except:
        print("Failed to parse url:", article_url)
        return None

    article.nlp()

    # variables to hold values for Story attributes
    headline = article.title
    imageurl = article.top_image
    timestamp = article.publish_date
    content = article.text
    keywords = article.keywords
    summary = article.summary
    description = article.meta_description
    clickbait = -1  # placeholder for clickbait label

    # populates keyword object with article.keywords
    list_of_keyword_obj = []
    for word in keywords:
        if word not in stopword:  # prevents stopwords from being keywords
            k = Keyword()
            k.keyword = word
            list_of_keyword_obj.append(k)

    s = Story()  # create Story object

    # set attributes
    s.name = headline
    s.imageurl = imageurl
    s.url = article_url
    current_time = datetime.datetime.now()

    if timestamp is not None:
        s.timestamp = timestamp.isoformat()
    else:  # generate timestamp if none found
        s.timestamp = current_time

    s.description = description
    s.keywords = list_of_keyword_obj
    s.summary = summary
    s.content = content
    s.clickbait = clickbait
    s.createtime = current_time

    return s
Example #36
    def listArticles(self):
        if (self.top_headlines["totalResults"] > 0):
            if (len(self.artList) != 0):
                self.artList = []
            if (self.lim > 0
                    and self.lim <= self.top_headlines['totalResults']):
                for i in range(self.lim):
                    art = self.top_headlines['articles'][i]

                    # Extract Text
                    new = Article(art['url'])
                    new.download()
                    new.parse()
                    storyText = "".join(
                        filter(lambda x: x in string.printable, new.text))
                    descr = "".join(
                        filter(lambda x: x in string.printable,
                               art['description']))
                    title = "".join(
                        filter(lambda x: x in string.printable, art['title']))

                    # Sentiment Analysis
                    document = types.Document(
                        content=storyText, type=enums.Document.Type.PLAIN_TEXT)
                    sentiment = client.analyze_sentiment(
                        document=document).document_sentiment

                    newStory = Story(url=art['url'],
                                     title=title,
                                     source=art['source']['name'],
                                     text=storyText,
                                     author=art['author'],
                                     imageURL=art['urlToImage'],
                                     date=art['publishedAt'][:10],
                                     des=descr,
                                     sent=sentiment.score,
                                     mag=sentiment.magnitude)
                    self.artList.append(newStory)
            else:
                for art in self.top_headlines["articles"]:
                    new = Article(art['url'])
                    new.download()
                    new.parse()
                    storyText = "".join(
                        filter(lambda x: x in string.printable, new.text))
                    descr = "".join(
                        filter(lambda x: x in string.printable,
                               art['description']))
                    title = "".join(
                        filter(lambda x: x in string.printable, art['title']))

                    # Sentiment Analysis
                    document = types.Document(
                        content=storyText, type=enums.Document.Type.PLAIN_TEXT)
                    sentiment = client.analyze_sentiment(
                        document=document).document_sentiment

                    newStory = Story(url=art['url'],
                                     title=title,
                                     source=art['source']['name'],
                                     text=storyText,
                                     author=art['author'],
                                     imageURL=art['urlToImage'],
                                     date=art['publishedAt'][:10],
                                     des=descr,
                                     sent=sentiment.score,
                                     mag=sentiment.magnitude)

                    self.artList.append(newStory)
        else:
            print("There were no articles with the query :", self.keyword)
Example #37
 }
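 # Note: this snippet begins mid-script; d (a parsed RSS feed), newsPaper, articles_array, count, LIMIT and company are defined earlier.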
 for entry in d.entries:
     # Check if publish date is provided, if no the article is skipped.
     # This is done to keep consistency in the data and to keep the script from crashing.
     if hasattr(entry, 'published'):
         if count > LIMIT:
             break
         article = {}
         article['link'] = entry.link
         date = entry.published_parsed
         article['published'] = datetime.fromtimestamp(
             mktime(date)).isoformat()
         try:
             content = Article(entry.link)
             content.download()
             content.parse()
         except Exception as e:
             # If the download for some reason fails (ex. 404) the script will continue downloading
             # the next article.
             print(e)
             print("continuing...")
             continue
         article['title'] = content.title
         article['text'] = content.text
         article['authors'] = content.authors
         article['top_image'] = content.top_image
         article['movies'] = content.movies
         newsPaper['articles'].append(article)
         articles_array.append(article)
         print(count, "articles downloaded from", company, ", url: ",
               entry.link)
Example #38
from newspaper import Article

# In[2]:

# Grabs the urls
url1 = 'https://www.washingtonpost.com/technology/2019/07/17/you-downloaded-faceapp-heres-what-youve-just-done-your-privacy/'
url2 = 'https://www.marketwatch.com/story/stock-markets-historic-bounce-may-signal-near-term-bottom-but-a-retest-of-the-low-like-1987-and-2008-is-still-a-possibility-2020-03-25?mod=home-page'
article1 = Article(url1)
article2 = Article(url2)

# In[5]:

# NLP, Natural language processesing
article1.download()
article2.download()
article1.parse()
article2.parse()
nltk.download('punkt')
article1.nlp()
article2.nlp()

# In[6]:

# Authors
article1.authors
article2.authors

# In[8]:

# publish date
article1.publish_date
Example #39
def getArticle(url=None, category=None):
    url = request.args.get('url')
    url_string = url.replace(':', '')

    try:
        ip = request.environ['REMOTE_ADDR']
        if request.headers.get('X-Forwarded-For'):
            ip = request.headers.get('X-Forwarded-For', ip)
        print(str.format("IP: {0}, Article: {1}", ip, url))
        print("ROUTE: " + request.access_route[-1])
    except:
        print("ERROR GETTING IP ADDRESS OR KEY")

    category = request.args.get('category')

    isHTML = False

    title = ""
    html = ""
    img = ""
    movies = []
    try:
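        # Look for a previously stored copy of this article cached under the key "html:<category>:<url>".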
        for key in r.keys(pattern="html:" + category + ":" + url_string):
            data = r.get(key)
            data = json.loads(data)
            isHTML = True
    except:
        print("Error fetching keys for article: " + url)

    if category:
        if isHTML:
            title = data['title']
            html = data['html']
            img = data['img']
            movies = data['movies']
            print("LOADED FROM DB")
        else:
            article = Article(url, keep_article_html=True)
            article.download()
            article.parse()
            title = article.title
            html = article.article_html
            img = article.top_image
            movies = article.movies
            print("CATEGROY: ", category)
            print("Title: ", article.title)
            r.set(
                'html:' + category + ":" + url_string,
                json.dumps({
                    "title": title,
                    "html": html,
                    "img": img,
                    "movies": movies
                }))

    return render_template("article.html",
                           url=url,
                           title=title,
                           body=Markup(html),
                           header_image=img,
                           video=movies)
Example #40
def find_similar_articles(news):
    news_article = Article(news)
    news_article.download()
    news_article.parse()
    news_title_tokenized = news_title_tokenization(
        preproccess_text(news_article.title))

    search_title = ""
    for word in news_title_tokenized:
        search_title = search_title + word + " "

    num_page_searched = 4
    search_results = google.search(search_title, num_page_searched)

    found_similar_article = 0
    for result in search_results:
        flag = 0
        search_result_title = result.name.split('http')[0]
        search_result_title = remove_unnecessary_noise(
            search_result_title.split('...')[0])
        search_result_title = preproccess_text(search_result_title)
        search_result_title = news_title_tokenization(search_result_title)

        result_string = ""
        for w in search_result_title:
            result_string = result_string + w + " "

        corpus = []
        corpus.append(search_title)
        corpus.append(result_string)

        vectorizer = CountVectorizer()
        features = vectorizer.fit_transform(corpus).todense()

        for f in features:
            dist = euclidean_distances(features[0], f)

        if dist < 1:
            found_similar_article = found_similar_article + 1

    news_article_text = preproccess_text(news_article.text)
    news_article_text = news_title_tokenization(news_article_text)

    article_result_string = ""
    for w in news_article_text:
        article_result_string = article_result_string + w + " "

    found_similar_article_body = 0
    search_result_link = search(search_title,
                                tld="com",
                                num=10,
                                stop=1,
                                pause=2)
    for link in search_result_link:
        check_news_article = Article(link)
        check_news_article.download()
        check_news_article.parse()

        check_news_article_text = preproccess_text(check_news_article.text)
        check_news_article_text = news_text_tokenization(
            check_news_article_text)

        check_article_result_string = ""
        for w in check_news_article_text:
            check_article_result_string = check_article_result_string + w + " "

        article_corpus = []
        article_corpus.append(article_result_string)
        article_corpus.append(check_article_result_string)

        article_vectorizer = CountVectorizer()
        article_features = article_vectorizer.fit_transform(
            article_corpus).todense()

        for f in article_features:
            article_dist = euclidean_distances(article_features[0], f)

        if article_dist < 0:
            found_similar_article = found_similar_article - 1

    if found_similar_article > 1:
        print('Found similar article titles!')
    elif found_similar_article == 1:
        print('Found a similar article title!')
    else:
        print('No similar article titles found!')
Example #41
def fetch_news(news_cat):
    categories = {
        'TPS': 'top_stories_url',
        'ENT': 'entertainment_url',
        'BSN': 'business_url',
        'SPR': 'sports_url',
        'TCH': 'tech_url',
    }

    sources = models.NewsSourceModel.objects.values()
    news_links = []
    for source in sources:

        # Extracts news links from XML Feed of News website
        f = feedparser.parse(source[categories[news_cat]])
        MAX_LINKS = 5  # Max number of links to extract from each News Source
        for i in range(MAX_LINKS):

            try:
                # Parse links and extract Keywords and Summary using NLP
                article = Article(f['entries'][i]['link'])
                article.download()
                article.parse()
                article.nlp()

                news_links.append({
                    'url': f['entries'][i]['link'],
                    'keywords': article.keywords,
                    'summary': article.summary,
                    'news_source_id': source['id'],
                    'title': f['entries'][i]['title'],
                })
            except:
                continue

    # Now compare News links for duplicates
    pop_indexes = []
    for i in range(len(news_links)):
        list1 = news_links[i]['keywords']

        if i <= (len(news_links) - 2):
            remaining_list = news_links[i + 1:]
            for element in remaining_list:
                list2 = element['keywords']
                match_percentage = match_lists(list1, list2)

                if match_percentage >= 50:
                    pop_indexes.append(news_links.index(element))

    # Pop the duplicate elements
    for pop_index in pop_indexes:
        news_links.pop(pop_index)

    # Now store the links in the database
    q, _ = models.CategoryUrlsModel.objects.get_or_create(news_cat=news_cat)
    for news_link in news_links:
        q.urls.create(
            url=news_link['url'],
            news_source_id=news_link['news_source_id'],
            keywords=news_link['keywords'],
            summary=news_link['summary'],
            title=news_link['title'],
        )
    q.save()
Example #42
html = req.text
soup = BeautifulSoup(html, "lxml")

articles = []

list_default = soup.find('ul', {"class": "list_default"})
li = list_default.findAll('li')

for i in range(page_size - 1, page_size * 3):
    article = {}
    link = li[i].find('a')['href']

    a = Article(link, language="ko")
    a.download()
    a.parse()

    article["title"] = a.title
    article["content"] = a.text
    article["image"] = a.top_image
    article["company"] = '중앙일보'
    article["date"] = a.publish_date

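    # Extract the byline: two to five Hangul characters followed by "기자" (reporter) or "인터넷 저널리스트" (internet journalist).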
    regex = re.compile("[가-힣]{2,5} (기자|인터넷 저널리스트)")
    author = regex.search(a.text)
    article["author"] = author.group()
    articles.append(article)

    print(article)

# In[ ]:
Example #43
def get_art_body(URL):
  a = Article(URL,language='ko')
  a.download()
  a.parse()
  return (a.title,a.text)
Example #44
def getArticle(url):
    article = Article(url)
    article.download()
    article.parse()

    return article
Example #45
def getTitle(url):
    article = Article(url)
    article.download()
    article.html
    article.parse()
    return article.title
Example #46
 def set_text(self):
     if not self.text and self.url:
         a = Article(self.url)
         a.download()
         a.parse()
         self.text = a.text
Example #47
def triggers(request):
    if request.method == 'POST':
        print(request.POST)
        data = dict(request.POST)
        # Driver Code
        key = 'show_details'
        one = checkKey(data, key)
        key = 'check_triggers'
        two = checkKey(data, key)
        key = 'show_wordcloud'
        three = checkKey(data, key)
        key = 'hate_speech'
        four = checkKey(data, key)
        print(one, two, three)
        #URL Link case
        if (one == True):
            url = data['Link'][0]
            print(url)
            article = Article(url)
            article.download()
            article.parse()
            authors = article.authors
            publishdate = article.publish_date
            #article.text
            article.nlp()
            keywords = article.keywords
            articlesummary = article.summary
            return render(
                request, 'consciousApp/triggers.html', {
                    'authors': authors,
                    'publishdate': publishdate,
                    'keywords': keywords,
                    'articlesummary': articlesummary
                })
        #Show triggers
        elif (two == True):
            text = request.POST['input_text'].lower()
            triggers = [
                "9 11", "9-11", "9/11", "ableism", "abusive", "ageism",
                "alcoholism", "animal abuse", "animal death",
                "animal violence", "bestiality", "gore", "corpse", "bully",
                "cannibal", "car accident", "child abuse", "childbirth",
                "classism", "death", "decapitation", "abuse", "drug", "heroin",
                "cocaine", "eating disorder", "anorexia", "binge eating",
                "bulimia", "fatphobia", "forced captivity", "holocaust",
                "hitler", "homophobia", "hostage", "incest", "kidnap",
                "murder", "nazi", "overdose", "pedophilia", "prostitution",
                "PTSD", "racism", "racist", "rape", "raping", "scarification",
                "self-harm", "self harm", "cutting", "sexism", "slavery",
                "slurs", "suicide", "suicidal", "swearing", "terminal illness",
                "terrorism", "torture", "transphobia", "violence", "warfare"
            ]
            tw = []
            text_file = open(
                './consciousApp/static/consciousApp/input/triggercheckdata.txt',
                'w+')
            text_file.write(str(text))
            text_file.close()
            for trigger in triggers:
                if text.find(trigger) > -1: tw.append(trigger)
            if tw == []: tw.append('No Triggers Found')
            return render(request, 'consciousApp/triggers.html', {
                'text': text,
                'triggers': tw,
                'data': data
            })
        #Show_cloud
        elif (three == True):
            text = request.POST['input_text'].lower()
            tokens = word_tokenize(text)
            textdata = nltk.Text(tokens)
            stopwords = set(STOPWORDS)
            wordcloud = WordCloud(stopwords=stopwords,
                                  max_font_size=50,
                                  max_words=100,
                                  background_color="white").generate(text)
            wordcloud.to_file(
                "./consciousApp/static/consciousApp/output/word-cloud.png")
            data = "./../../static/consciousApp/output/word-cloud.png"
            return render(request, 'consciousApp/triggers.html',
                          {'data': data})

        elif (four == True):
            sonar = Sonar()
            text = request.POST['input_text'].lower()
            url = data['Link'][0]
            data = sonar.ping(text=text)["classes"]
            hate_speech = data[0]
            hate_speech_confidence = hate_speech["confidence"] * 100
            offensive_language = data[1]
            offensive_language_confidence = offensive_language[
                "confidence"] * 100
            neither = data[2]
            neither_confidence = neither["confidence"] * 100
            print(type(data))
            print(offensive_language_confidence * 100,
                  hate_speech_confidence * 100, neither_confidence * 100)
            return render(
                request, 'consciousApp/triggers.html', {
                    'hate_speech_confidence': hate_speech_confidence,
                    'offensive_language_confidence':
                    offensive_language_confidence,
                    'neither_confidence': neither_confidence
                })
    else:

        return render(request, 'consciousApp/triggers.html')
Example #48
def scrap(link):
    article = Article(link)
    article.download()
    article.parse()
    return article.text
Example #49
def HindustanTimesScrapper():

    SRC = KNOWN_NEWS_SOURCES["Hindustan Times"]

    data1 = get_chronological_headlines(SRC["pages"].format(1))
    data2 = get_trending_headlines(SRC["home"])
    text_lst = []
    url_lst = []
    date_lst = []
    title_lst = []
    try :
        for data in data1:
            if data["content"] == "NA":
                try:
                    article = Article(data["link"])
                    article.download()
                    article.parse()
                    article.nlp()
                    summary = article.text
                    text_lst.append(summary)
                except:
                    text_lst.append(data["content"])
            else:
                text_lst.append(data["content"])
            url_lst.append(data["link"])
            date = data["published_at"]
            if(date == None) :
                date = datetime.now()
            date_lst.append(date)
            try :
                title_lst.append(data["title"])
            except:
                title_lst.append(data["content"].replace("\n\n", " ").replace("\n", " ").split(".")[0])
        for data in data2:
            if data["content"] == "NA":
                try:
                    article = Article(data["link"])
                    article.download()
                    article.parse()
                    article.nlp()
                    summary = article.text
                    text_lst.append(summary)
                except:
                    text_lst.append(data["content"])
            else:
                text_lst.append(data["content"])
            url_lst.append(data["link"])
            date = data["published_at"]
            if(date == None) :
                date = datetime.now()
            date_lst.append(date)
            try :
                title_lst.append(data["title"])
            except:
                title_lst.append(data["content"].replace("\n\n", " ").replace("\n", " ").split(".")[0])

        df_raw = pd.DataFrame(list(zip(text_lst, url_lst, date_lst, title_lst)), columns=["text", "url", "date", "headline"])

        df_crime = get_crime(df_raw)
        data = get_data("./database/data.json")
        df = get_location(df_crime, data)
        df = preprocessing2(df, data)
        return df.reset_index(drop=True)
    except :
        return pd.DataFrame(columns=["index","text","url","crime","location","region","city","date","headline"])
Example #50
# Import the libraries
from newspaper import Article
import nltk
from gtts import gTTS
import os

# Get the article
article = Article('https://www.poetryfoundation.org/poems/46945/baa-baa-black-sheep')

article.download()  # Download the article
article.parse()  # Parse the article
nltk.download('punkt')  # Download the 'punkt' package
article.nlp()  # Apply Natural Language Processing (NLP)

# Get the articles text
mytext = article.text

# Print the text
print(mytext)

# Language in which you want to convert
# language = 'pt-br' #Portuguese (Brazil)
language = 'en'  # English

# Passing the text and language to the engine,
# here we have marked slow=False. Which tells
# the module that the converted audio should
# have a high speed
myobj = gTTS(text=mytext, lang=language, slow=False)

# Saving the converted audio in a mp3 file named
Example #51
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import warnings
import yaml
import pyaudio
import speech_recognition as sr

warnings.filterwarnings('ignore')

nltk.download('punkt', quiet=True)
### website from where we want to extract the data
article1 = Article('https://en.wikipedia.org/wiki/Coronavirus')
article1.download()
article1.parse()
article1.nlp()

article2 = Article('https://www.euro.who.int/en/health-topics/noncommunicable-diseases/mental-health/data-and-resources/mental-health-and-covid-19')
article2.download()
article2.parse()
article2.nlp()

article3 = Article('https://www.healthline.com/health-news/what-covid-19-is-doing-to-our-mental-health')
article3.download()
article3.parse()
article3.nlp()

article4 = Article('https://www.webmd.com/lung/coronavirus')
article4.download()
article4.parse()
article4.nlp()
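Note: the imported CountVectorizer and cosine_similarity are not used in the snippet above. The sketch below shows how they are typically combined to pick the corpus sentence closest to a user query; the function name and the zero-score cutoff are illustrative, not part of the original program.

import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('punkt', quiet=True)

def best_matching_sentence(query, corpus_text):
    # Split the corpus into sentences and vectorize them together with the query
    sentences = nltk.sent_tokenize(corpus_text)
    n = len(sentences)
    vectors = CountVectorizer().fit_transform(sentences + [query])
    # Cosine similarity of the query row (the last one) against every corpus sentence
    scores = cosine_similarity(vectors[n], vectors[:n]).flatten()
    best = scores.argmax()
    return sentences[best] if scores[best] > 0 else None

For example, best_matching_sentence("How does the virus spread?", article1.text) returns the sentence from the first article that is closest to the question.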
Exemple #52
0
from newspaper import Article

# A Chinese-language article from People's Daily Online
url = "http://world.people.com.cn/n1/2019/0308/c1002-30964972.html"

# For newspapers in other languages, refer to the table above
toi_article = Article(url, language='zh')  # 'zh' for Chinese

# To download the article
toi_article.download()

# To parse the article
toi_article.parse()

# To perform natural language processing (NLP); required to populate the summary below
toi_article.nlp()

# To extract title
print("Article's Title:")
print(toi_article.title)
print("*" * 80)

# To extract text
print("Article's Text:")
print(toi_article.text)
print("*" * 80)

# To extract summary
print("Article's Summary:")
print(toi_article.summary)
print("*" * 80)
Exemple #53
0
import spacy
from newspaper import Article
nlp = spacy.load("en_core_web_sm")

url_1 = 'https://www.wsj.com/articles/u-s-officials-walk-out-of-meeting-at-presidential-palace-in-kabul-11553628051'
url_2 = 'https://www.wsj.com/articles/iran-moves-to-cement-its-influence-in-syria-11553632926'
article_1 = Article(url_1)
article_2 = Article(url_2)
article_1.download()
article_2.download()
article_1.parse()
article_2.parse()

article_stream = [article_1.text, article_2.text]

for doc in nlp.pipe(article_stream, batch_size=50):
    print(doc.vocab)  # prints the Vocab object shared by the pipeline

    # Uncomment to inspect the named entities found in each article:
    # for entity in doc.ents:
    #     print(entity.text, entity.start_char, entity.end_char, entity.label_)
Exemple #54
0
    def crawling(self, category_name):
        # Multi Process PID
        print(category_name + " PID: " + str(os.getpid()))    

        writer = Writer(category_name=category_name, date=self.date)
        # Article URL format
        if (category_name == "연합뉴스속보"):
            url = "http://news.naver.com/main/list.nhn?mode=LPOD&mid=sec&sid1=001&sid2=140&oid=001&isYeonhapFlash=Y" \
                  + "&date="

        else:
            url = "http://news.naver.com/main/list.nhn?mode=LSD&mid=sec&sid1=" + str(
                self.categories.get(category_name)) + "&date="

        # Collect articles from start_year/start_month through end_year/end_month
        day_urls = self.make_news_page_url(url, self.date['start_year'], self.date['end_year'], self.date['start_month'], self.date['end_month'])
        print(category_name + " Urls are generated")
        print("The crawler starts")

        for URL in day_urls:
            print(URL)
            regex = re.compile(r"date=(\d+)")
            news_date = regex.findall(URL)[0]

            request = self.get_url_data(URL)
            document = BeautifulSoup(request.content, 'html.parser')
            
            # html - newsflash_body - type06_headline, type06
            # Collect the articles on each page
            if (category_name == "연합뉴스속보"):
                post_temp = document.select('.newsflash_body .type02 li ')

            else:
                post_temp = document.select('.newsflash_body .type06_headline li dl')
                post_temp.extend(document.select('.newsflash_body .type06 li dl'))
           
            # Store the URLs of the articles on each page
            post = []
            headlines = []
            companys = []

            
            for line in post_temp:
                post.append(line.a.get('href'))  # add the URL of every article on this page to the post list
                try:
                    companys.append(line.find('span', class_="writing").text)
                except:
                    companys.append("err")
                try:
                    h = line.find_all('a')
                    if len(h) > 1:
                        headlines.append(h[1].text)
                    elif len(h) == 1:
                        headlines.append(h[0].text)
                    else:
                        headlines.append("err")
                except:
                    headlines.append("err")
            del post_temp
        
            
            print(len(post))

            for i in range(len(post)):  # article URLs
                # crawling delay between requests
                print(i)
                sleep(0.01)
                content_url = post[i]
                
                # Fetch the article HTML
                try:
                    article = Article(content_url, language='ko')
                    article.download()
                    article.parse()
                    text_sentence = article.text.strip()
                    text_company = companys[i]
                    text_headline = headlines[i].strip()
        ######################################################################
                    if self.keyword == 'initvalue':
                        wcsv = writer.get_writer_csv()
                        wcsv.writerow([news_date, category_name, text_company, text_headline, text_sentence, content_url])
                    else:
                        headline_to_words = text_headline.split()
                        if self.keyword in headline_to_words:
                            wcsv = writer.get_writer_csv()
                            wcsv.writerow([news_date, category_name, text_company, text_headline, text_sentence, content_url])
        ######################################################################

                            
                except Exception as err:
                    print(err)
        
        writer.close()
        return        
Exemple #55
0
def align_row_text():
    source_text = request.form['source_text']
    target_text = request.form['target_text']
    # check if source and target are urls
    url_rex = r"(?i)\b(?:(?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\((?:[^\s()<>]+|(?:\([^\s()<>]+\)))*\))+(?:\((?:[^\s()<>]+|(?:\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    if re.fullmatch(url_rex, source_text.strip().lower()):
        src_article = Article(source_text.strip())
        src_article.download()
        src_article.parse()
        source_text = src_article.title + "\n" + src_article.text
    if re.fullmatch(url_rex, target_text.strip().lower()):
        tar_article = Article(target_text.strip())
        tar_article.download()
        tar_article.parse()
        target_text = tar_article.title + "\n" + tar_article.text

    # segment source and target

    src_lang_code = lang_detect.detect(source_text)
    tar_lang_code = lang_detect.detect(target_text)

    if src_lang_code == 'zh-cn':
        srx_src_code = 'Generic'
    else:
        srx_src_code = src_lang_code

    if tar_lang_code == 'zh-cn':
        srx_tar_code = 'Generic'
    else:
        srx_tar_code = tar_lang_code

    srx_rules = srx_segmenter.parse(srx_file_path)
    seg_results = srx_segmenter.SrxSegmenter(srx_rules[srx_src_code],
                                             source_text)
    source_list = seg_results.extract()[0]
    seg_results = srx_segmenter.SrxSegmenter(srx_rules[srx_tar_code],
                                             target_text)
    target_list = seg_results.extract()[0]
    # translate target
    target_mt_list = mt_helpers.google_translate_chunk_by_chunk(
        target_list, tar_lang_code, src_lang_code)
    # align
    # initiate the alignment class
    algorithm = request.form.get('algorithm', 'fuzzy')
    align_options = {
        "location_weight":
        float(request.form.get('input_location_weight', 0.2)),
        "length_weight":
        float(request.form.get('input_length_weight', 0.1)),
        "meta_weight":
        float(request.form.get('input_length_weight', 0.1)),
        "semantic_weight":
        float(request.form.get('input_semantic_weight', 0.6)),
        "search_range":
        float(request.form.get('input_paragraph_size', 5)),
        "minimum_semantic_score":
        float(request.form.get('input_minimum_semantic_score', 0.5)),
        "minimum_partial_sem_match":
        0.1,
        "minimum_length_score":
        float(request.form.get('input_minimum_length_score', 0.6))
    }

    if algorithm == 'fuzzy':
        semantic_class = fuzzy_comp.FuzzyComp
    else:
        semantic_class = tfidf_scikit.TfidfComp
    alg = TranslationAligner()

    alg.align(semantic_class,
              source_list,
              target_list, [],
              target_mt_list,
              options=align_options)
    # save json file to a random file name under static files and return it with the results
    temp_file_name = ''.join(
        random.choices(string.ascii_uppercase + string.digits, k=10))
    temp_json_file_name = temp_file_name + ".json"
    alg.export_json_dict(os.path.join(export_path, temp_json_file_name))
    del alg
    return {"json_file_name": temp_json_file_name}
Exemple #56
0
def getData():
    url = request.args.get('url')
    # From Newspaper Framework getting required data
    content = Article(url)
    content.download()
    content.parse()
    title = content.title
    rawText = content.text
    # Unformatted data to show to the user
    textDisplay = rawText.split("\n\n")
    textDisplay = ' '.join(textDisplay)
    # Converting numbered text to digits
    t2d = text2digits.Text2Digits()
    numText = t2d.convert(rawText)
    text = numText.split("\n\n")
    text = ' '.join(text)
    # API data limit restriction: keep at most 5000 characters
    text = text[:5000]
    jsonData = {"text": text}
    configDataResource = os.path.join(SITE_ROOT, "data", "configdata.json")
    configData = json.load(open(configDataResource))

    # NER API call request
    headers = {
        'x-api-key': configData["X_API_KEY"],
        'Content-type': 'application/json'
    }
    ner_response = requests.post(
        configData["NAMED_ENTITY_RECOGNITION_ENDPOINT"],
        headers=headers,
        data=json.dumps(jsonData))
    # print(ner_response.text)
    # Deserializing the response
    places = lambda: None
    places.__dict__ = json.loads(ner_response.text)
    print(places.LOC)

    json_url = os.path.join(SITE_ROOT, "data", "sg-citi.json")
    data = json.load(open(json_url))

    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)
    LOC = []
    CASE = []
    for ent in doc.ents:
        print(ent.text, ent.start_char, ent.end_char, ent.label_)
        if ent.label_ == "CARDINAL":
            CASE.append(ent.text)
        if ent.label_ == "GPE":
            LOC.append(ent.text)

    count = []
    for i in CASE:
        if i.isdigit():
            if i not in count:
                count.append(i)
    print("COUNT: ", count)
    if not len(count):
        count = list(i for i in range(80, 500, 7))
    returnJson = {
        "text": textDisplay,
        "location": [],
        "category": ner_response.text
    }
    for i in places.LOC:
        for citi in data:
            if i in citi["name"] and citi["name"] not in returnJson["location"]:
                returnJson["location"].append({
                    "name":
                    citi["name"],
                    "lat":
                    "no1",
                    "lon":
                    "no2",
                    "count":
                    count[random.randrange(0, len(count))]
                })
                break
    print(returnJson)
    return jsonify(returnJson)
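Note: the lambda/__dict__ trick above is only there to get attribute-style access on the NER response. A plain dict avoids it; a minimal sketch, assuming the response JSON carries the LOC field used above:

import json

def extract_locations(ner_response_text):
    # Parse the NER service response and return its LOC entries (empty list if absent)
    payload = json.loads(ner_response_text)
    return payload.get("LOC", [])

The final loop would then iterate over extract_locations(ner_response.text) instead of places.LOC.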
Exemple #57
0
def get_article(url):
    article = Article(url, language='en')
    article.download()
    article.parse()
    return article
Exemple #58
0
    def parse_content(self, response):
        # This function does the detailed parsing of a news article

        ID = 'songtengteng'

        website_name = '商务部贸易救济调查局'

        # Website section
        website_block = response.xpath(
            "//div[@class='position']/a[2]/text()").extract_first()

        news_url = response.meta['url']

        # Author
        news_author_list = response.xpath('//script')
        if len(news_author_list) != 0:
            news_author = news_author_list.re(
                'v.{2}\ss.{4}e\s=\s\"[\u4e00-\u9fa5]+\"')[0][13:].replace(
                    '"', '')
        else:
            news_author = '商务部贸易救济调查局'

        # Publication time, unified format: YYYY MM DD HH:MM:SS           (pattern: v.{2}\stm\s=\s\".*\")
        publish_time = response.meta['publish_time']
        year = publish_time[0:4]
        month = publish_time[5:7]
        day = publish_time[8:10]
        juti_time = publish_time[-8:]
        publish_time = year + month + day + ' ' + juti_time

        # Tags supplied by the article itself
        news_tags = response.xpath('//script').re(
            'v.{2}\sc.+e\s=\s\"[\u4e00-\u9fa5]+\"')[0][14:].replace('"', '')

        # News title
        news_title = response.xpath('//h3/text()').extract_first()

        # News body text
        a = Article(response.url, language='zh')  # Chinese
        a.download()
        a.parse()
        news_content = a.text

        # Get the article's images and their file names
        image_urls = []
        image_names = []
        image_urls1 = response.xpath(
            '//p[@class="detailPic"]/img/@src|//div[@class="article_con"]/center/img/@src|//p[@style="text-align: center"]/img/@src'
        ).extract()
        if image_urls1 != []:
            image_urls = image_urls1
            for i in range(len(image_urls)):
                if i < 10 and i >= 0:
                    image_name = news_title + '_000' + str(i)
                    image_names.append(image_name)
                elif i < 100 and i >= 10:
                    image_name = news_title + '_00' + str(i)
                    image_names.append(image_name)
                elif i < 1000 and i >= 100:
                    image_name = news_title + '_0' + str(i)
                    image_names.append(image_name)
                else:
                    image_name = news_title + str(i)
                    image_names.append(image_name)

        yield self.getItem(
            id=ID,
            news_url=news_url,
            website_name=website_name,
            website_block=website_block,
            news_title=news_title,
            publish_time=publish_time,
            news_author=news_author,
            news_tags=news_tags,
            news_content=news_content,
            image_urls=image_urls,
            image_names=image_names,
        )
Exemple #59
0
def scrape_analyze(url):
    article = Article(url)
    article.download()
    article.parse()
    return article.text
Exemple #60
0
    def parse_artical(self, response):  # detailed article parsing
        ID = 'songtengteng'

        # News URL
        news_url = response.meta['url']

        # News title
        news_title = response.xpath('//h1/text()').extract_first()

        # Author
        a = response.xpath(
            '//div[@class="info-source"]/span/a/text()').extract_first()
        if a is None:
            news_author = ''
        else:
            news_author = a

        # Publication time
        publish_time = response.xpath(
            '//div[@class="info-source"]/span[2]/text()').extract_first()
        year = publish_time[0:4]
        month = publish_time[5:7]
        day = publish_time[8:10]
        juti_time = publish_time[-5:]
        publish_time = year + month + day + ' ' + juti_time + ':' + '00'

        # Body text
        '''Consider using a text-density algorithm to extract the article body more quickly'''
        a = Article(response.meta['url'], language='zh')  # Chinese
        a.download()
        a.parse()
        news_content = a.text

        # Tags
        news_tags = ''

        # Images
        image_urls1 = response.xpath('//p[@class="pi"]/img/@src').extract()
        image_urls = []
        image_names = []
        if image_urls1 != []:
            for i in range(len(image_urls1)):
                image_url = image_urls1[i]
                image_urls.append(image_url)
                if i >= 0 and i < 10:
                    image_title = news_title + '000' + str(i)
                elif i >= 10 and i < 100:
                    image_title = news_title + '00' + str(i)
                elif i >= 100 and i < 1000:
                    image_title = news_title + '0' + str(i)
                else:
                    image_title = news_title + str(i)
                image_names.append(image_title)

        yield self.getItem(id=ID,
                           news_url=news_url,
                           website_name='搜狐焦点',
                           website_block='访谈',
                           news_title=news_title,
                           publish_time=publish_time,
                           news_author=news_author,
                           news_tags=news_tags,
                           news_content=news_content,
                           image_urls=image_urls,
                           image_names=image_names)
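Note: both parse methods above build zero-padded image names with an if/elif ladder over the index. A format specifier gives the same result in one line; a minimal sketch (the underscore separator follows the first method, the second concatenates the title and index directly):

def make_image_names(news_title, count):
    # "{:04d}" zero-pads the index to four digits (0000, 0001, ..., 0999),
    # matching what the if/elif ladders produce for indexes below 1000
    return [f"{news_title}_{i:04d}" for i in range(count)]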