Example #1
	def extract(self, article_url):

		article = Article(url=article_url)
		article.download()
		article.parse()

		return article.text
Example #2
def get_article():
	tree_urls = ET.parse("DB_urls.xml")
	root_urls = tree_urls.getroot()

	# The problem with English and Chinese can be solved with 
	for field_urls in root_urls.findall("row"):
		url_urls = field_urls.find("field").text
	#	url_urls = 'http://news.sina.com.cn/c/2014-04-21/204729980947.shtml'
	#	url_urls = 'http://china.caixin.com/2013-12-30/100623243.html'

		try:
			response = urllib2.urlopen(url_urls)
			status = response.code

			#print "detected webpage code:", status

			if(status == 404):
				continue
			else:
				a_zh = Article(url_urls, language = 'zh')
				a_zh.download()
				a_zh.parse()
				content_urls = a_zh.text

				if(content_urls == ''):
					a_en = Article(url_urls, language = 'en')
					a_en.download()
					a_en.parse()
					content_urls = content_urls + a_en.text

				if(content_urls != ''):
					compare_article(url_urls, content_urls)			
		except:
			pass
Example #3
 def test_arabic_fulltext_extract(self):
     url = "http://arabic.cnn.com/2013/middle_east/8/3/syria.clashes/index.html"
     article = Article(url=url, language="ar")
     article.download()
     article.parse()
     with codecs.open(os.path.join(TEXT_FN, "arabic_text_1.txt"), "r", "utf8") as f:
         assert article.text == f.read()
Example #4
def parse_article(url, lang, featured=0, db=connect_db()):
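    # Return the cached row if this URL has already been parsed and stored.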
    cur = db.execute("select * from articles where url=?", (url,))
    entries = [dict(id=row[0], url=row[1], title=row[2], image=row[3], text=row[4], authors=row[5], date=row[6], featured=row[7], language=row[8]) for row in cur.fetchall()]

    if len(entries) >= 1:
        return entries[0]

    article = Article(url)
    article.download()

    try:
        article.parse()
    except:
        return None

    title = article.title
    image = article.top_image
    text = article.text
    authors = ",".join(article.authors)
    date = int(time.mktime(article.publish_date.timetuple())) if type(article.publish_date) is datetime.datetime else 0

    db.execute("insert into articles (url, title, image, text, authors, date, featured, language) values (?, ?, ?, ?, ?, ?, ?, ?)", (url, title, image, text, authors, date, featured and len(text) >= 50, lang))
    db.commit()

    idquery = db.execute("select (id) from articles where url=?", (url,))
    id = [row[0] for row in idquery.fetchall()][0]

    return {"id": id, "url": url, "title": title, "image": image, "text": text, "authors": authors, "date": date, "language": lang}
Example #5
 def test_spanish_fulltext_extract(self):
     url = "http://ultimahora.es/mallorca/noticia/noticias/local/fiscalia-anticorrupcion-estudia-recurre-imputacion-infanta.html"
     article = Article(url=url, language="es")
     article.download()
     article.parse()
     with codecs.open(os.path.join(TEXT_FN, "spanish_text_1.txt"), "r", "utf8") as f:
         assert article.text == f.read()
Example #6
def get_nlp_data(url):
	article = Article(url)
	article.download()
	article.parse()
	article.nlp()
	
	return json.dumps(article.keywords)
Example #7
 def test_chinese_fulltext_extract(self):
     url = "http://www.bbc.co.uk/zhongwen/simp/chinese_news/2012/12/121210_hongkong_politics.shtml"
     article = Article(url=url, language="zh")
     article.download()
     article.parse()
     with codecs.open(os.path.join(TEXT_FN, "chinese_text_1.txt"), "r", "utf8") as f:
         assert article.text == f.read()
Example #8
def main():
    try:
        headlines = requests.get(headline_url)
        
        headlines = json.loads(headlines.text)
        for headline in headlines['Headlines']:
            print("Processing Article %s" % headline['Url'])
            article = Article(headline['Url'])
            article.download()
            article.parse()
            
            
            response = requests.post(calais_url, files={'file': article.text}, headers=headers, timeout=80)
            rdf = json.loads(response.text)
            
            for x in rdf:
                if '_type' in rdf[x] and 'name' in rdf[x]:
                    print("Output for %s %s" % (rdf[x]['_type'], rdf[x]['name']))
                    for instance in rdf[x]['instances']:
                        text = instance['prefix'] + instance['suffix']
                        blob = TextBlob(text)
                        for sentence in blob.sentences:
                            print(sentence)
                            print(sentence.sentiment.polarity)
            print('--------------------')
            
            #print(rdf)
    except Exception as e:
        print('Error connecting:', e)
Example #9
    def run(self):
        logging.debug("run() - [WAIT]")
        from newspaper import Article

        '''
        Library documentation: http://newspaper.readthedocs.org/en/latest/user_guide/quickstart.htm
        '''

        NOTES_LIST = [
            '118',
            '117',
            # '116',
            # '115',
        ]
        for note_id in NOTES_LIST:
            note = Article(url="http://site.tiagoprnl.in/core/visitor_home/nota/%s/" % note_id)
            note.download()

            print '*' * 100
            # print 'H T M L'
            # print note.html
            #print '*' * 100
            # print 'T E X T'
            note.parse()
            print note.text


        logging.debug("run() - [DONE]")
Example #10
def main(argv):
    if len(argv) > 1:
        htmlist = argv[1]
    else:
        htmlist = 'htmlist'

    # Our permanent config for html cleaning
    config = Config()
    config.language = 'id'
    config.MIN_SENT_COUNT = 20
    config.memoize = False
    config.fetch_images = False
    config.verbose= True

    cleaner = Article(url='', config=config)

    with open(htmlist, 'r') as f:
        htmfile = f.read().split('\n')

    raw = []

    for htm in htmfile:
        print (htm)
        if not htm.endswith("rss.html"):
            with open(htm, 'r') as f:
                h = f.read()

            cleaner.set_html(h)
            cleaner.parse()
            sentences = nlp.split_sentences(cleaner.text)
            #raw.append(sentences])
        
            with open('htm-out', 'a') as f:
                [f.write(r + '\n') for r in sentences]
Example #11
def post_new(request):
    if request.method == "POST":
        form = PostForm(request.POST)
        if form.is_valid():
            post = form.save(commit=False)
            post.author = request.user
            post.published_date = timezone.now()
            post.save()
            return redirect('blog.views.post_detail', pk=post.pk)
    elif request.method == 'GET':
        url = request.GET.get('url', '')
               
        if len(url) > 5:
            article = Article(url, language='en')
            article.download()
            article.parse()
            article.nlp()
            image = article.top_image
            summary = article.summary.replace('\n', ' ').replace(u'\u2019',"\'")
            title = article.title.replace(u'\u2019',"\'")
            source = url.split('//')[1].split('/')[0].replace('www.','')
            status = 'UD'
            form = PostForm({'title': title, 'summary': summary, 'image': image, 'link':url, 'source':source, 'status':status,}) 
        else:
            form = PostForm() 

    return render(request, 'blog/post_edit.html', {'form': form})
Example #12
def scrapeURLS(inFilPath):
    texts = []
    cache = loadCache()
    toDelURLs = []
    with open(inFilPath) as f:
        urls = f.readlines()
    for url in urls:
        if filter(urlFilters, url):
            toDelURLs.append(url)
            
        if url in cache:
            txt = cache[url]
        else:
            print "Scraping URL %s" % url
            article = Article(url)
            article.download()
            article.parse()
            txt = article.text.replace("\n", " ").replace("  ", " ").strip()
            if txt == "" or filter(txtFilter, txt):
                toDelURLs.append(url)
                continue
            cacheURL(url, txt)
        texts.append(txt)
        deleteURLs(inFilPath, toDelURLs)
    return texts
Example #13
 def check_url(args):
     """
     :param (basestr, basestr) url, res_filename:
     :return: (pubdate_failed, fulltext_failed)
     """
     url, res_filename = args
     pubdate_failed, fulltext_failed = False, False
     html = mock_resource_with(res_filename, 'html')
     try:
         a = Article(url)
         a.download(html)
         a.parse()
         if a.publish_date is None:
             pubdate_failed = True
     except Exception:
         print('<< URL: %s parse ERROR >>' % url)
         traceback.print_exc()
         pubdate_failed, fulltext_failed = True, True
     else:
         correct_text = mock_resource_with(res_filename, 'txt')
         if not (a.text == correct_text):
             # print('Diff: ', simplediff.diff(correct_text, a.text))
             # `correct_text` holds the reason of failure if failure
             print('%s -- %s -- %s' %
                   ('Fulltext failed',
                    res_filename, correct_text.strip()))
             fulltext_failed = True
             # TODO: assert statements are commented out for full-text
             # extraction tests because we are constantly tweaking the
             # algorithm and improving
             # assert a.text == correct_text
     return pubdate_failed, fulltext_failed
Example #14
    def wrap_newspaper(self, web_page):
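        # Reuse HTML that was already fetched: assign it and mark the article as downloaded so parse() skips the download step.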
        parser = NewspaperArticle(url=web_page.final_url)
        parser.html = web_page.html
        parser.is_downloaded = True
        parser.parse()

        return parser
Example #15
def f(url):
	url_urls = url.text
	try:
		response = urllib2.urlopen(url_urls)
		status = response.code

		#print "detected webpage code:", status

		if(status == 404):
			pass
		else:
			a_zh = Article(url_urls, language = 'zh')
			a_zh.download()
			a_zh.parse()
			# content_urls = a_zh.text

			# if(content_urls == ''):
			# 	a_en = Article(url_urls, language = 'en')
			# 	a_en.download()
			# 	a_en.parse()
			# 	content_urls = content_urls + a_en.text

			# if(content_urls != ''):
			# 	pass
			# 	# compare_article(url_urls, content_urls)			
	except:
		pass
Example #16
def extract():
  url = sys.argv[1:].pop()

  a = Article(url, keep_article_html=True)
  a.download()
  a.parse()
  a.nlp()

  parsed_uri = urlparse(a.source_url)
  domain = '{uri.netloc}'.format(uri=parsed_uri)

  try:
    publish_date = a.publish_date.strftime('%Y-%m-%d %H:%M')
  except AttributeError:
    publish_date = ""

  try:
    authors = ", ".join(a.authors)
  except AttributeError:
    authors = ""

  result = {}
  result['html'] = a.html
  result['body'] = a.text
  result['title'] = a.title
  result['top_image'] = a.top_image
  result['author'] = authors
  result['html_body'] = a.article_html
  result['favicon'] = a.meta_favicon
  result['description'] = a.summary
  result['publish_date'] = publish_date
  result['keywords'] = a.keywords
  result['sitename'] = re.sub(r"^www\.", "", domain)

  return json.dumps(result).encode('utf-8')
Example #17
def show_article():
    url_to_clean = request.args.get('url_to_clean')
    if not url_to_clean:
        return redirect(url_for('index'))

    article = Article(url_to_clean)
    article.download()
    article.parse()

    try:
      html_string = ElementTree.tostring(article.clean_top_node)
    except:
      html_string = "Error converting html to string."

    try:
      article.nlp()
    except:
      log.error("Couldn't process with NLP")

    a = {
          'html': html_string, 
         'authors': str(', '.join(article.authors)), 
         'title': article.title,
         'text': article.text,
         'top_image': article.top_image,
         'videos': str(', '.join(article.movies)),
         'keywords': str(', '.join(article.keywords)),
         'summary': article.summary
         }
    return render_template('article/index.html', article=a, url=url_to_clean)
    
Example #18
def parse_input(text, extractor='newspaper'):
    if isinstance(text, str) or isinstance(text, unicode):
        if text.startswith(('http://', 'https://')):
            # Input is a link - need to extract the text from html
            if extractor.lower() == 'goose':
                from goose import Goose
                urlparse = Goose()
                article = urlparse.extract(url=text)
                return unicode_to_ascii(article.cleaned_text)
            else:
                from newspaper import Article
                article = Article(text)
                article.download()
                article.parse()
                return unicode_to_ascii(article.text)
        elif text.endswith('.txt'):
            # Input is a file - need to read it
            textfile = open(text, 'rb')
            article = textfile.read()
            textfile.close()
            return unicode_to_ascii(article)
        else:
            # Input is a string containing the raw text
            return unicode_to_ascii(text)
    else:
        raise ValueError('Input text must be of type str or unicode.')
Example #19
def get_news():
    urls = get_urls()
    news = News.query.with_entities(News.source_url).all()

    used_urls = []
    for n in news:
        used_urls.append(n[0])

    for url in urls:
        if not url in used_urls:
            used_urls.append(url)

            article = Article(url, language='pt', keep_article_html=True)
            article.download()
            article.parse()
            article.nlp()

            news_article = News(url)
            news_article.slug = slugify(article.title)
            news_article.title = article.title
            news_article.text = article.text
            news_article.top_image = article.top_image
            news_article.summary = article.summary
            news_article.article_html = article.article_html
            news_article.created_at = datetime.datetime.now()

            exists_this_news = News.query.filter_by(source_url=url).first()

            if not exists_this_news:
                print(url)
                db.session.add(news_article)
                db.session.commit()
Example #20
    def parse_news(self, response):
        item = ScrapyGooglenewsItem()
        #only log the warning info from request
        logging.getLogger("requests").setLevel(logging.WARNING)

        for href in response.xpath('//h2[@class="title"]/a/@href').extract():
            item['link'] = href
            #use newspaper-0.0.8 to scrape the webpage, then get clean text.
            article = Article(item['link'])
            article.download()
            article.parse()
            item['title'] = article.title
            item['text'] = article.text
            #item['authors'] = article.authors
            #item['date'] = article.publish_date

            topic = response.url.split('&')[-1]
            if topic == 'topic=w':
                item['domain'] = 'World'
            elif topic == 'topic=n':
                item['domain'] = 'U.S.'
            elif topic == 'topic=b':
                item['domain'] = 'Business'
            elif topic == 'topic=tc':
                item['domain'] = 'Technology'
            elif topic == 'topic=e':
                item['domain'] = 'Entertainment'
            elif topic == 'topic=s':
                item['domain'] = 'Sports'
            elif topic == 'topic=snc':
                item['domain'] = 'Science'
            elif topic == 'topic=m':
                item['domain'] = 'Health'

            yield item
Example #21
 def get_article_by_url(url):
     article = Article(url, fetch_images=False)
     article.download()
     if url == "empty":
         return "nolist"
     article.parse()
     return article.text
Example #22
    def extract(self, item):
        """Creates an instance of Article without a Download and returns an ArticleCandidate with the results of
        parsing the HTML-Code.

        :param item: A NewscrawlerItem to parse.
        :return: ArticleCandidate containing the recovered article data.
        """
        article_candidate = ArticleCandidate()
        article_candidate.extractor = self._name()

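        # Parse the spider's pre-fetched HTML by feeding it into an empty Article instead of downloading again.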
        article = Article('')
        article.set_html(item['spider_response'].body)
        article.parse()
        article_candidate.title = article.title
        article_candidate.description = article.meta_description
        article_candidate.text = article.text
        article_candidate.topimage = article.top_image
        article_candidate.author = article.authors
        if article.publish_date is not None:
            try:
                article_candidate.publish_date = article.publish_date.strftime('%Y-%m-%d %H:%M:%S')
            except ValueError as exception:
                self.log.debug('%s: Newspaper failed to extract the date in the supported format,'
                              'Publishing date set to None' % item['url'])
        article_candidate.language = article.meta_lang

        return article_candidate
Example #23
def insert_url(url):
    conn = sqlite3.connect('publico_news_sqllite3.db')
    cursor = conn.cursor()

    # get the article in plain text
    article = Article(url)
    article.download()
    article.parse()
    date = article.publish_date
    title = article.title
    text = article.text

    item = dict()
    item['datetime'] = date
    item['title'] = title
    item['text'] = text
    item['category'] = sys.argv[1].split('/')[6]
    item['link'] = sys.argv[1]
    item['origLink'] = sys.argv[1]

    print(item['category'])
    print(item['datetime'])

    if not duplicate(item, item['category'], cursor):
        status = insert_db(item, item['category'], cursor)
        if status == 1:
            print(sys.argv[1], "inserted")
        else:
            print("Error", status)
    else:
        print(url, "already in BD")

    conn.commit()
    conn.close()
Example #24
 def test2(self):
     articles =[
      'http://www.radionz.co.nz/news/national/281869/seven-arrests-over-police-manhunt',
      'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11491573',
      'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11580358',
      'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11580350',
      'http://www.stuff.co.nz/national/crime/75978990/whanganui-woman-accused-of-leaving-child-in-car-overnight.html',
      'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11574608',
      'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11577923',
      'http://www.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11591401',
      'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11566180'
      ]
     
     articles = [
      'http://www.express.co.uk/news/uk/657926/New-Zealand-John-Key-slams-David-Cameron-Britain-forgetting-history-European-Union-EU',
      'http://www.bbc.co.uk/news/uk-wales-35954982',
      'http://www.telegraph.co.uk/news/2016/04/04/david-cameron-will-be-an-excellent-former-prime-minister/',
      'http://www.pressandjournal.co.uk/fp/news/aberdeenshire/880519/david-camerons-father-named-panamanian-company-aberdeenshire-home/',
      'http://www.theguardian.com/politics/2016/apr/01/senior-tories-brexit-vote-leave-attacks-david-cameron-letter-nhs-staff',
      'http://www.dailymail.co.uk/news/article-3519908/Nuclear-drones-threat-British-cities-Cameron-Obama-hold-war-game-session-respond-attack-kill-thousands-people.html',
      'http://www.telegraph.co.uk/news/2016/03/31/if-david-cameron-cant-stop-the-tory-fighting-hell-clear-jeremy-c/',
      'http://www.manchestereveningnews.co.uk/news/greater-manchester-news/gmp-boost-number-armed-officers-11125178',
      'http://www.theguardian.com/commentisfree/2016/apr/03/cameron-headphones-what-is-cool-what-is-not']
     
     with open("./Output2.txt", "w") as text_file:
         for url in articles:
             print(url)
             a = Article(url)
             a.download()
             a.parse()
             text_file.write(a.text.encode('utf-8'))
             text_file.write('\n')
Example #25
def is_valid_article(link):
    print("Checking valid:\n" + link)

    if "cnn.com" not in link:
        return False
    if "html" not in link:
        return False
    article = Article(link)
    article.download()
    article.parse()
    article.nlp()
    keywords = article.keywords

    matched = False

    for key in keywords:
        if key in nc_set:
            matched = True
    for key in keywords:
        if key in contorversial_set:
            matched = False

    if matched & (len(article.authors) > 0) & (article.publish_date < datetime.datetime(2007, 12, 30, 0, 0)):
        main_file.write(article.title+"\t\t"+",".join(article.keywords)+"\t\t"+link+"\t\t"+article.text+"\n")
        visited_articles.write(link+"\n")
        return True

    return False
Example #26
def get_image():
  url = request.args.get('url', '')
  if not url:
    abort(400)

  if is_image(url):
    return redirect(url)

  article = Article(url)
  article.download()

  try:
    article.parse()
  except (IOError, UnicodeDecodeError):
    return '', 422

  try:
    top_image = article.top_image.rsplit('?',1)[0]
  except AttributeError:
    top_image = ''

  if not top_image == '':
    return redirect(top_image)
  else:
    return '', 422
Example #27
    def runTest(self):
        # The "correct" fulltext needs to be manually checked
        # we have 50 so far
        FULLTEXT_PREPARED = 50
        domain_counters = {}

        with open(URLS_FILE, 'r') as f:
            urls = [d.strip() for d in f.readlines() if d.strip()]

        for url in urls[:FULLTEXT_PREPARED]:
            domain = get_base_domain(url)
            if domain in domain_counters:
                domain_counters[domain] += 1
            else:
                domain_counters[domain] = 1

            res_filename = domain + str(domain_counters[domain])
            html = mock_resource_with(res_filename, 'html')
            try:
                a = Article(url)
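                # pass the local HTML fixture to download() so the test never hits the network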
                a.download(html)
                a.parse()
            except Exception:
                print('<< URL: %s parse ERROR >>' % url)
                traceback.print_exc()
                continue

            correct_text = mock_resource_with(res_filename, 'txt')
            condensed_url = url[:30] + ' ...'
            print('%s -- fulltext status: %s' %
                  (condensed_url, a.text == correct_text))
Example #28
def makeDocs():
    utc = pytz.utc
    es = Elasticsearch(BONSAI_URL, verify_certs= True)
    es.indices.delete(index='news', ignore=[400, 404])
    es.indices.create(index='news', ignore=400)

    print "Created"
    cnn_paper = newspaper.build(u'http://cnn.com', memoize_articles=False)
    a = defaultdict(int)
    cnn_articles = cnn_paper.articles
    print cnn_paper.size()
    for i in range(10):
        article = cnn_articles[i]
        url = article.url
        art = Article(url)
        art.download()
        art.parse()
        print art.publish_date
        print art.text
        print "Article" + str(i)
        print art.publish_date is not None
        print art.text is not None
        if (art.publish_date is not None) and (art.text is not None):
            try:
                doc = {
                'domain': 'CNN',
                'date': utc.localize(art.publish_date), 
                'text': art.text
                }
                res = es.index(index="news", doc_type='article', id=i, body=doc)
                print "Doc" + str(i)
            except:
                print "Doc not accepted"
Example #29
def get_details():
    url = request.args.get('url', '')
    if not url:
      abort(400)

    if is_image(url):
      result = {
        "url": url,
        "top_image": url,
        "text": "",
      }
      return jsonify(result)

    article = Article(url)
    article.download()

    try:
      article.parse()
    except (IOError, UnicodeDecodeError):
      return '', 422

    try:
      top_image = article.top_image.rsplit('?',1)[0]
    except AttributeError:
      top_image = ''

    result = {
      "url": url,
      "top_image": top_image,
      "text": article.text,
    }

    return jsonify(result)
Example #30
def get_article(url):
    a = Article(url)
    a.download()
    a.parse()

    article = dict()

    article['title'] = a.title
    article['publish_date'] = a.publish_date
    article['authors'] = a.authors
    article['lead_image'] = a.top_image
    article['movies'] = a.movies
    article['text'] = a.text
    article['keywords'] = get_keywords(a.text)


    # This is more likely to fail.
    # try:
    #     article.nlp()
    #     article['summary'] = 'This summary is generated: \n ' + a.summary
    # except Exception:
    #     print Exception
    #     article['summary'] = a.summary

    return article
Example #31
 def analyse_web_page_article(self, url):
     article = Article(url)
     article.download()
     article.parse()
     return article, self.analyse(article.text)
Example #32
 def livearticles():
     bs_live = feedparser.parse("https://economictimes.indiatimes.com/rssfeedstopstories.cms")
     bs_compare = bs_live.entries[0].published
     hindu_live = feedparser.parse("https://indianexpress.com/section/world/feed/")
     hindu_compare = hindu_live.entries[0].published
     # bl_live = feedparser.parse("https://www.thehindubusinessline.com/feeder/default.rss")
     # bl_compare = bl_live.entries[0].published
     var = 1
     with open('NewsArticles.csv', 'a+', newline='', encoding="UTF-8") as file:
         writer = csv.writer(file)
         while var == 1:
             print("-----------------------------------------------------------------------------------------------------------")
             bs_live = feedparser.parse("https://economictimes.indiatimes.com/rssfeedstopstories.cms")
             for entry in bs_live.entries:
                 if entry.published > bs_compare:
                     url = entry.link
                     article = Article(url)
                     article.download()
                     article.parse()
                     article.nlp()
                     title = article.title
                     summary = article.summary
                     keywords = ', '.join(article.keywords)
                     date = entry.published
                     writer.writerow([title, date, keywords, summary, url])
                     print(entry.title)
                     print(entry.published)
                     print("the business-standard")
                     t = time.localtime()
                     current_time = time.strftime("%H:%M:%S", t)
                     print(current_time)
                 else:
                     bs_compare = bs_live.entries[0].published
                     break
             print("-----------------------------------------------------------------------------------------------------------")
             hindu_live = feedparser.parse("https://indianexpress.com/section/world/feed/")
             for entry in hindu_live.entries:
                 if entry.published > hindu_compare:
                     url = entry.link
                     article = Article(url)
                     article.download()
                     article.parse()
                     article.nlp()
                     title = article.title
                     summary = article.summary
                     keywords = ', '.join(article.keywords)
                     date = entry.published
                     writer.writerow([title, date, keywords, summary, url])
                     print(entry.title)
                     print(entry.published)
                     print("the hindu")
                     t = time.localtime()
                     current_time = time.strftime("%H:%M:%S", t)
                     print(current_time)
                 else:
                     hindu_compare = hindu_live.entries[0].published
                     break
             print("-----------------------------------------------------------------------------------------------------------")
             # bl_live = feedparser.parse("https://www.thehindubusinessline.com/feeder/default.rss")
             # for entry in bl_live.entries:
             #     if entry.published > bl_compare:
             #         url = entry.link
             #         article = Article(url)
             #         article.download()
             #         article.parse()
             #         article.nlp()
             #         title = article.title
             #         summary = article.summary
             #         keywords = ', '.join(article.keywords)
             #         date = entry.published
             #         writer.writerow([title, date, keywords, summary, url])
             #         print(entry.title)
             #         print(entry.published)
             #         print("the business line")
             #         t = time.localtime()
             #         current_time = time.strftime("%H:%M:%S", t)
             #         print(current_time)
             #     else:
             #         bl_compare = bl_live.entries[0].published
             #         break
             time.sleep(5)
Example #33
'''
pip3 install newspaper3k
'''

from newspaper import Article

import sys

try:
    url = sys.argv[1]
except IndexError:
    print('[X] please enter url')
    sys.exit(0)

print('[*] url: %s' % url)

print('[F] ----- download & parse -----')
article = Article(url, language='zh')
article.download()
article.parse()

print('[*] authors: %s' % article.authors)
print('[*] publish_date: %s' % article.publish_date)
print('[*] text: %s' % article.text)
print('[*] top_image: %s' % article.top_image)
print('[*] movies: %s' % article.movies)

article.nlp()
print('[F] ----- article.nlp() -----')
print('[*] keywords: %s' % article.keywords)
print('[*] summary: %s' % article.summary)
Example #34
def fetch_words_from_news(url, translate_TF=False):
    # return [[],[]] 0: en, 1:tr
    article = Article(url)
    article.download()
    article.parse()

    for_nltk = []
    news_text = article.text
    for_nltk.append(article.text)
    news_text = news_text.upper()
    news_text_wo_rn = news_text.replace('\n', ' ')
    news_text_wo_rn = news_text_wo_rn.replace('\r', ' ')
    news_text_list = news_text_wo_rn.split(' ')
    news_text_list = set(news_text_list)
    tokenized_sents = [word_tokenize(i) for i in for_nltk]

    # remove punctuations from list

    res = []
    new_res = []

    #s.translate(None, string.punctuation)

    #res = [s.translate(str.maketrans('', '', string.punctuation)) for s in tokenized_sents[0]

    for tixt in tokenized_sents[0]:
        new_tixt = ''.join(
            c.translate(str.maketrans('', '', string.punctuation + '“”'))
            for c in tixt if c not in string.punctuation + '“”')
        res.append(new_tixt)

    for d in res:
        if not d == '':
            new_res.append(d)

    capitalized_new_res = [KAP.upper() for KAP in new_res]

    capitalized_setted_new_res = set(capitalized_new_res)

    # delete one len item

    more_than_one_len_CSNR = []

    for e in capitalized_setted_new_res:
        if not len(e) < 2:
            more_than_one_len_CSNR.append(e)

    # delete numbers

    digitless_more_than_OLC = []

    for g in more_than_one_len_CSNR:
        if g.isalpha():
            digitless_more_than_OLC.append(g)

    tags_of_diggless = [nltk.pos_tag(f) for f in digitless_more_than_OLC]
    tags_of_diggless_2 = nltk.pos_tag(digitless_more_than_OLC)

    prepless_digitless_MTO = []

    for h in digitless_more_than_OLC:
        if not h.lower() in stop_words:
            prepless_digitless_MTO.append(h)

    if_word_in_cor_PDMTO = []
    TR_if_word_in_cor_PDMTO = []

    for g in prepless_digitless_MTO:
        if g.lower() in words.words():
            if_word_in_cor_PDMTO.append(g)
            transed = ""
            if not translate_TF == False:
                transed = translate.translate(g, 'en-tr')  # tr.set_text(g)
                transed = transed['text'][0]
            TR_if_word_in_cor_PDMTO.append(transed)  #  tr.translate()
            #yazi = transed['text'][0]+"\\r\\n"
            #with open('log.txt', 'a') as file:
            #    file.write(yazi)

    return [if_word_in_cor_PDMTO,
            TR_if_word_in_cor_PDMTO]  # return [[],[]] 0: en, 1:tr
Example #35
def newspaperize(article_url):
    """Takes a string url that contains an article. Returns a Story object from 
    models.py containing information scraped from the article located at the url."""

    article = Article(article_url)  # create Article object

    print("Downloading:", article_url)

    try:  # returns None if url fails to download
        article.download()
    except:
        print("Failed to download url:", article_url)
        return None

    try:  # returns None if url cannot be parsed
        article.parse()
    except:
        print("Failed to parse url:", article_url)
        return None

    article.nlp()

    # variables to hold values for Story attributes
    headline = article.title
    imageurl = article.top_image
    timestamp = article.publish_date
    content = article.text
    keywords = article.keywords
    summary = article.summary
    description = article.meta_description
    clickbait = -1  # placeholder for clickbait label

    # populates keyword object with article.keywords
    list_of_keyword_obj = []
    for word in keywords:
        if word not in stopword:  # prevents stopwords from being keywords
            k = Keyword()
            k.keyword = word
            list_of_keyword_obj.append(k)

    s = Story()  # create Story object

    # set attributes
    s.name = headline
    s.imageurl = imageurl
    s.url = article_url
    current_time = datetime.datetime.now()

    if timestamp is not None:
        s.timestamp = timestamp.isoformat()
    else:  # generate timestamp if none found
        s.timestamp = current_time

    s.description = description
    s.keywords = list_of_keyword_obj
    s.summary = summary
    s.content = content
    s.clickbait = clickbait
    s.createtime = current_time

    return s
Example #36
    def listArticles(self):
        if (self.top_headlines["totalResults"] > 0):
            if (len(self.artList) != 0):
                self.artList = []
            if (self.lim > 0
                    and self.lim <= self.top_headlines['totalResults']):
                for i in range(self.lim):
                    art = self.top_headlines['articles'][i]

                    # Extract Text
                    new = Article(art['url'])
                    new.download()
                    new.parse()
                    storyText = "".join(
                        filter(lambda x: x in string.printable, new.text))
                    descr = "".join(
                        filter(lambda x: x in string.printable,
                               art['description']))
                    title = "".join(
                        filter(lambda x: x in string.printable, art['title']))

                    # Sentiment Analysis
                    document = types.Document(
                        content=storyText, type=enums.Document.Type.PLAIN_TEXT)
                    sentiment = client.analyze_sentiment(
                        document=document).document_sentiment

                    newStory = Story(url=art['url'],
                                     title=title,
                                     source=art['source']['name'],
                                     text=storyText,
                                     author=art['author'],
                                     imageURL=art['urlToImage'],
                                     date=art['publishedAt'][:10],
                                     des=descr,
                                     sent=sentiment.score,
                                     mag=sentiment.magnitude)
                    self.artList.append(newStory)
            else:
                for art in self.top_headlines["articles"]:
                    new = Article(art['url'])
                    new.download()
                    new.parse()
                    storyText = "".join(
                        filter(lambda x: x in string.printable, new.text))
                    descr = "".join(
                        filter(lambda x: x in string.printable,
                               art['description']))
                    title = "".join(
                        filter(lambda x: x in string.printable, art['title']))

                    # Sentiment Analysis
                    document = types.Document(
                        content=storyText, type=enums.Document.Type.PLAIN_TEXT)
                    sentiment = client.analyze_sentiment(
                        document=document).document_sentiment

                    newStory = Story(url=art['url'],
                                     title=title,
                                     source=art['source']['name'],
                                     text=storyText,
                                     author=art['author'],
                                     imageURL=art['urlToImage'],
                                     date=art['publishedAt'][:10],
                                     des=descr,
                                     sent=sentiment.score,
                                     mag=sentiment.magnitude)

                    self.artList.append(newStory)
        else:
            print("There were no articles with the query :", self.keyword)
Example #37
 }
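 # Note: this snippet begins mid-script; d (a parsed RSS feed), newsPaper, articles_array, count, LIMIT and company are defined earlier.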
 for entry in d.entries:
     # Check if publish date is provided, if no the article is skipped.
     # This is done to keep consistency in the data and to keep the script from crashing.
     if hasattr(entry, 'published'):
         if count > LIMIT:
             break
         article = {}
         article['link'] = entry.link
         date = entry.published_parsed
         article['published'] = datetime.fromtimestamp(
             mktime(date)).isoformat()
         try:
             content = Article(entry.link)
             content.download()
             content.parse()
         except Exception as e:
             # If the download for some reason fails (ex. 404) the script will continue downloading
             # the next article.
             print(e)
             print("continuing...")
             continue
         article['title'] = content.title
         article['text'] = content.text
         article['authors'] = content.authors
         article['top_image'] = content.top_image
         article['movies'] = content.movies
         newsPaper['articles'].append(article)
         articles_array.append(article)
         print(count, "articles downloaded from", company, ", url: ",
               entry.link)
Example #38
from newspaper import Article

# In[2]:

# Grabs the urls
url1 = 'https://www.washingtonpost.com/technology/2019/07/17/you-downloaded-faceapp-heres-what-youve-just-done-your-privacy/'
url2 = 'https://www.marketwatch.com/story/stock-markets-historic-bounce-may-signal-near-term-bottom-but-a-retest-of-the-low-like-1987-and-2008-is-still-a-possibility-2020-03-25?mod=home-page'
article1 = Article(url1)
article2 = Article(url2)

# In[5]:

# NLP, Natural language processesing
article1.download()
article2.download()
article1.parse()
article2.parse()
nltk.download('punkt')
article1.nlp()
article2.nlp()

# In[6]:

# Authors
article1.authors
article2.authors

# In[8]:

# publish date
article1.publish_date
Example #39
def getArticle(url=None, category=None):
    url = request.args.get('url')
    url_string = url.replace(':', '')

    try:
        ip = request.environ['REMOTE_ADDR']
        if request.headers.get('X-Forwarded-For'):
            ip = request.headers.get('X-Forwarded-For', ip)
        print(str.format("IP: {0}, Article: {1}", ip, url))
        print("ROUTE: " + request.access_route[-1])
    except:
        print("ERROR GETTING IP ADDRESS OR KEY")

    category = request.args.get('category')

    isHTML = False

    title = ""
    html = ""
    img = ""
    movies = []
    try:
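        # Look for a previously stored copy of this article cached under the key "html:<category>:<url>".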
        for key in r.keys(pattern="html:" + category + ":" + url_string):
            data = r.get(key)
            data = json.loads(data)
            isHTML = True
    except:
        print("Error fetching keys for article: " + url)

    if category:
        if isHTML:
            title = data['title']
            html = data['html']
            img = data['img']
            movies = data['movies']
            print("LOADED FROM DB")
        else:
            article = Article(url, keep_article_html=True)
            article.download()
            article.parse()
            title = article.title
            html = article.article_html
            img = article.top_image
            movies = article.movies
            print("CATEGROY: ", category)
            print("Title: ", article.title)
            r.set(
                'html:' + category + ":" + url_string,
                json.dumps({
                    "title": title,
                    "html": html,
                    "img": img,
                    "movies": movies
                }))

    return render_template("article.html",
                           url=url,
                           title=title,
                           body=Markup(html),
                           header_image=img,
                           video=movies)
Example #40
def find_similar_articles(news):
    news_article = Article(news)
    news_article.download()
    news_article.parse()
    news_title_tokenized = news_title_tokenization(
        preproccess_text(news_article.title))

    search_title = ""
    for word in news_title_tokenized:
        search_title = search_title + word + " "

    num_page_searched = 4
    search_results = google.search(search_title, num_page_searched)

    found_similar_article = 0
    for result in search_results:
        flag = 0
        search_result_title = result.name.split('http')[0]
        search_result_title = remove_unnecessary_noise(
            search_result_title.split('...')[0])
        search_result_title = preproccess_text(search_result_title)
        search_result_title = news_title_tokenization(search_result_title)

        result_string = ""
        for w in search_result_title:
            result_string = result_string + w + " "

        corpus = []
        corpus.append(search_title)
        corpus.append(result_string)

        vectorizer = CountVectorizer()
        features = vectorizer.fit_transform(corpus).todense()

        for f in features:
            dist = euclidean_distances(features[0], f)

        if dist < 1:
            found_similar_article = found_similar_article + 1

    news_article_text = preproccess_text(news_article.text)
    news_article_text = news_title_tokenization(news_article_text)

    article_result_string = ""
    for w in news_article_text:
        article_result_string = article_result_string + w + " "

    found_similar_article_body = 0
    search_result_link = search(search_title,
                                tld="com",
                                num=10,
                                stop=1,
                                pause=2)
    for link in search_result_link:
        check_news_article = Article(link)
        check_news_article.download()
        check_news_article.parse()

        check_news_article_text = preproccess_text(check_news_article.text)
        check_news_article_text = news_text_tokenization(
            check_news_article_text)

        check_article_result_string = ""
        for w in check_news_article_text:
            check_article_result_string = check_article_result_string + w + " "

        article_corpus = []
        article_corpus.append(article_result_string)
        article_corpus.append(check_article_result_string)

        article_vectorizer = CountVectorizer()
        article_features = article_vectorizer.fit_transform(
            article_corpus).todense()

        for f in article_features:
            article_dist = euclidean_distances(article_features[0], f)

        if article_dist < 0:
            found_similar_article = found_similar_article - 1

    if found_similar_article > 1:
        print('Found similar article titles!')
    elif found_similar_article == 1:
        print('Found a similar article title!')
    else:
        print('No similar article titles found!')
Example #41
def fetch_news(news_cat):
    categories = {
        'TPS': 'top_stories_url',
        'ENT': 'entertainment_url',
        'BSN': 'business_url',
        'SPR': 'sports_url',
        'TCH': 'tech_url',
    }

    sources = models.NewsSourceModel.objects.values()
    news_links = []
    for source in sources:

        # Extracts news links from XML Feed of News website
        f = feedparser.parse(source[categories[news_cat]])
        MAX_LINKS = 5  # Max number of links to extract from each News Source
        for i in range(MAX_LINKS):

            try:
                # Parse links and extract Keywords and Summary using NLP
                article = Article(f['entries'][i]['link'])
                article.download()
                article.parse()
                article.nlp()

                news_links.append({
                    'url': f['entries'][i]['link'],
                    'keywords': article.keywords,
                    'summary': article.summary,
                    'news_source_id': source['id'],
                    'title': f['entries'][i]['title'],
                })
            except:
                continue

    # Now compare News links for duplicates
    pop_indexes = []
    for i in range(len(news_links)):
        list1 = news_links[i]['keywords']

        if i <= (len(news_links) - 2):
            remaining_list = news_links[i + 1:]
            for element in remaining_list:
                list2 = element['keywords']
                match_percentage = match_lists(list1, list2)

                if match_percentage >= 50:
                    pop_indexes.append(news_links.index(element))

    # Pop the duplicate elements
    for pop_index in pop_indexes:
        news_links.pop(pop_index)

    # Now store the links in the database
    q, _ = models.CategoryUrlsModel.objects.get_or_create(news_cat=news_cat)
    for news_link in news_links:
        q.urls.create(
            url=news_link['url'],
            news_source_id=news_link['news_source_id'],
            keywords=news_link['keywords'],
            summary=news_link['summary'],
            title=news_link['title'],
        )
    q.save()
Example #42
html = req.text
soup = BeautifulSoup(html, "lxml")

articles = []

list_default = soup.find('ul', {"class": "list_default"})
li = list_default.findAll('li')

for i in range(page_size - 1, page_size * 3):
    article = {}
    link = li[i].find('a')['href']

    a = Article(link, language="ko")
    a.download()
    a.parse()

    article["title"] = a.title
    article["content"] = a.text
    article["image"] = a.top_image
    article["company"] = '중앙일보'
    article["date"] = a.publish_date

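    # Extract the byline: two to five Hangul characters followed by "기자" (reporter) or "인터넷 저널리스트" (internet journalist).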
    regex = re.compile("[가-힣]{2,5} (기자|인터넷 저널리스트)")
    author = regex.search(a.text)
    article["author"] = author.group()
    articles.append(article)

    print(article)

# In[ ]:
Example #43
def get_art_body(URL):
  a = Article(URL,language='ko')
  a.download()
  a.parse()
  return (a.title,a.text)
Example #44
def getArticle(url):
    article = Article(url)
    article.download()
    article.parse()

    return article
Example #45
def getTitle(url):
    article = Article(url)
    article.download()
    article.html
    article.parse()
    return article.title
Example #46
 def set_text(self):
     if not self.text and self.url:
         a = Article(self.url)
         a.download()
         a.parse()
         self.text = a.text
Example #47
def triggers(request):
    if request.method == 'POST':
        print(request.POST)
        data = dict(request.POST)
        # Driver Code
        key = 'show_details'
        one = checkKey(data, key)
        key = 'check_triggers'
        two = checkKey(data, key)
        key = 'show_wordcloud'
        three = checkKey(data, key)
        key = 'hate_speech'
        four = checkKey(data, key)
        print(one, two, three)
        #URL Link case
        if (one == True):
            url = data['Link'][0]
            print(url)
            article = Article(url)
            article.download()
            article.parse()
            authors = article.authors
            publishdate = article.publish_date
            #article.text
            article.nlp()
            keywords = article.keywords
            articlesummary = article.summary
            return render(
                request, 'consciousApp/triggers.html', {
                    'authors': authors,
                    'publishdate': publishdate,
                    'keywords': keywords,
                    'articlesummary': articlesummary
                })
        #Show triggers
        elif (two == True):
            text = request.POST['input_text'].lower()
            triggers = [
                "9 11", "9-11", "9/11", "ableism", "abusive", "ageism",
                "alcoholism", "animal abuse", "animal death",
                "animal violence", "bestiality", "gore", "corpse", "bully",
                "cannibal", "car accident", "child abuse", "childbirth",
                "classism", "death", "decapitation", "abuse", "drug", "heroin",
                "cocaine", "eating disorder", "anorexia", "binge eating",
                "bulimia", "fatphobia", "forced captivity", "holocaust",
                "hitler", "homophobia", "hostage", "incest", "kidnap",
                "murder", "nazi", "overdose", "pedophilia", "prostitution",
                "PTSD", "racism", "racist", "rape", "raping", "scarification",
                "self-harm", "self harm", "cutting", "sexism", "slavery",
                "slurs", "suicide", "suicidal", "swearing", "terminal illness",
                "terrorism", "torture", "transphobia", "violence", "warfare"
            ]
            tw = []
            text_file = open(
                './consciousApp/static/consciousApp/input/triggercheckdata.txt',
                'w+')
            text_file.write(str(text))
            text_file.close()
            for trigger in triggers:
                if text.find(trigger) > -1: tw.append(trigger)
            if tw == []: tw.append('No Triggers Found')
            return render(request, 'consciousApp/triggers.html', {
                'text': text,
                'triggers': tw,
                'data': data
            })
        #Show_cloud
        elif (three == True):
            text = request.POST['input_text'].lower()
            tokens = word_tokenize(text)
            textdata = nltk.Text(tokens)
            stopwords = set(STOPWORDS)
            wordcloud = WordCloud(stopwords=stopwords,
                                  max_font_size=50,
                                  max_words=100,
                                  background_color="white").generate(text)
            wordcloud.to_file(
                "./consciousApp/static/consciousApp/output/word-cloud.png")
            data = "./../../static/consciousApp/output/word-cloud.png"
            return render(request, 'consciousApp/triggers.html',
                          {'data': data})

        elif (four == True):
            sonar = Sonar()
            text = request.POST['input_text'].lower()
            url = data['Link'][0]
            data = sonar.ping(text=text)["classes"]
            hate_speech = data[0]
            hate_speech_confidence = hate_speech["confidence"] * 100
            offensive_language = data[1]
            offensive_language_confidence = offensive_language[
                "confidence"] * 100
            neither = data[2]
            neither_confidence = neither["confidence"] * 100
            print(type(data))
            print(offensive_language_confidence * 100,
                  hate_speech_confidence * 100, neither_confidence * 100)
            return render(
                request, 'consciousApp/triggers.html', {
                    'hate_speech_confidence': hate_speech_confidence,
                    'offensive_language_confidence':
                    offensive_language_confidence,
                    'neither_confidence': neither_confidence
                })
    else:

        return render(request, 'consciousApp/triggers.html')
Example #48
def scrap(link):
    article = Article(link)
    article.download()
    article.parse()
    return article.text
Example #49
def HindustanTimesScrapper():

    SRC = KNOWN_NEWS_SOURCES["Hindustan Times"]

    data1 = get_chronological_headlines(SRC["pages"].format(1))
    data2 = get_trending_headlines(SRC["home"])
    text_lst = []
    url_lst = []
    date_lst = []
    title_lst = []
    try :
        for data in data1:
            if data["content"] == "NA":
                try:
                    article = Article(data["link"])
                    article.download()
                    article.parse()
                    article.nlp()
                    summary = article.text
                    text_lst.append(summary)
                except:
                    text_lst.append(data["content"])
            else:
                text_lst.append(data["content"])
            url_lst.append(data["link"])
            date = data["published_at"]
            if(date == None) :
                date = datetime.now()
            date_lst.append(date)
            try :
                title_lst.append(data["title"])
            except:
                title_lst.append(data["content"].replace("\n\n", " ").replace("\n", " ").split(".")[0])
        for data in data2:
            if data["content"] == "NA":
                try:
                    article = Article(data["link"])
                    article.download()
                    article.parse()
                    article.nlp()
                    summary = article.text
                    text_lst.append(summary)
                except:
                    text_lst.append(data["content"])
            else:
                text_lst.append(data["content"])
            url_lst.append(data["link"])
            date = data["published_at"]
            if(date == None) :
                date = datetime.now()
            date_lst.append(date)
            try :
                title_lst.append(data["title"])
            except:
                title_lst.append(data["content"].replace("\n\n", " ").replace("\n", " ").split(".")[0])

        df_raw = pd.DataFrame(list(zip(text_lst, url_lst, date_lst, title_lst)), columns=["text", "url", "date", "headline"])

        df_crime = get_crime(df_raw)
        data = get_data("./database/data.json")
        df = get_location(df_crime, data)
        df = preprocessing2(df, data)
        return df.reset_index(drop=True)
    except :
        return pd.DataFrame(columns=["index","text","url","crime","location","region","city","date","headline"])
Example #50
# Import the libraries
from newspaper import Article
import nltk
from gtts import gTTS
import os

# Get the article
article = Article('https://www.poetryfoundation.org/poems/46945/baa-baa-black-sheep')

article.download()  # Download the article
article.parse()  # Parse the article
nltk.download('punkt')  # Download the 'punkt' package
article.nlp()  # Apply Natural Language Processing (NLP)

# Get the articles text
mytext = article.text

# Print the text
print(mytext)

# Language in which you want to convert
# language = 'pt-br' #Portuguese (Brazil)
language = 'en'  # English

# Passing the text and language to the engine,
# here we have marked slow=False. Which tells
# the module that the converted audio should
# have a high speed
myobj = gTTS(text=mytext, lang=language, slow=False)

# Saving the converted audio in a mp3 file named
Example #51
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import warnings
import yaml
import pyaudio
import speech_recognition as sr

warnings.filterwarnings('ignore')

nltk.download('punkt', quiet=True)
### website from where we want to extract the data
article1 = Article('https://en.wikipedia.org/wiki/Coronavirus')
article1.download()
article1.parse()
article1.nlp()

article2 = Article('https://www.euro.who.int/en/health-topics/noncommunicable-diseases/mental-health/data-and-resources/mental-health-and-covid-19')
article2.download()
article2.parse()
article2.nlp()

article3 = Article('https://www.healthline.com/health-news/what-covid-19-is-doing-to-our-mental-health')
article3.download()
article3.parse()
article3.nlp()

article4 = Article('https://www.webmd.com/lung/coronavirus')
article4.download()
article4.parse()
article4.nlp()
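Note: the imported CountVectorizer and cosine_similarity are not used in the snippet above. The sketch below shows how they are typically combined to pick the corpus sentence closest to a user query; the function name and the zero-score cutoff are illustrative, not part of the original program.

import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('punkt', quiet=True)

def best_matching_sentence(query, corpus_text):
    # Split the corpus into sentences and vectorize them together with the query
    sentences = nltk.sent_tokenize(corpus_text)
    n = len(sentences)
    vectors = CountVectorizer().fit_transform(sentences + [query])
    # Cosine similarity of the query row (the last one) against every corpus sentence
    scores = cosine_similarity(vectors[n], vectors[:n]).flatten()
    best = scores.argmax()
    return sentences[best] if scores[best] > 0 else None

For example, best_matching_sentence("How does the virus spread?", article1.text) returns the sentence from the first article that is closest to the question.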
Exemple #52
0
from newspaper import Article

# A Chinese-language article from People's Daily Online
url = "http://world.people.com.cn/n1/2019/0308/c1002-30964972.html"

# For newspapers in other languages, refer to the table above
toi_article = Article(url, language='zh')  # 'zh' for Chinese

# To download the article
toi_article.download()

# To parse the article
toi_article.parse()

# To perform natural language processing (NLP); required to populate the summary below
toi_article.nlp()

# To extract title
print("Article's Title:")
print(toi_article.title)
print("*" * 80)

# To extract text
print("Article's Text:")
print(toi_article.text)
print("*" * 80)

# To extract summary
print("Article's Summary:")
print(toi_article.summary)
print("*" * 80)
Exemple #53
0
import spacy
from newspaper import Article
nlp = spacy.load("en_core_web_sm")

url_1 = 'https://www.wsj.com/articles/u-s-officials-walk-out-of-meeting-at-presidential-palace-in-kabul-11553628051'
url_2 = 'https://www.wsj.com/articles/iran-moves-to-cement-its-influence-in-syria-11553632926'
article_1 = Article(url_1)
article_2 = Article(url_2)
article_1.download()
article_2.download()
article_1.parse()
article_2.parse()

article_stream = [article_1.text, article_2.text]

for doc in nlp.pipe(article_stream, batch_size=50):
    print(doc.vocab)  # prints the Vocab object shared by the pipeline

    # Uncomment to inspect the named entities found in each article:
    # for entity in doc.ents:
    #     print(entity.text, entity.start_char, entity.end_char, entity.label_)
Exemple #54
0
    def crawling(self, category_name):
        # Multi Process PID
        print(category_name + " PID: " + str(os.getpid()))    

        writer = Writer(category_name=category_name, date=self.date)
        # Article URL format
        if (category_name == "연합뉴스속보"):
            url = "http://news.naver.com/main/list.nhn?mode=LPOD&mid=sec&sid1=001&sid2=140&oid=001&isYeonhapFlash=Y" \
                  + "&date="

        else:
            url = "http://news.naver.com/main/list.nhn?mode=LSD&mid=sec&sid1=" + str(
                self.categories.get(category_name)) + "&date="

        # Collect articles from start_year/start_month through end_year/end_month
        day_urls = self.make_news_page_url(url, self.date['start_year'], self.date['end_year'], self.date['start_month'], self.date['end_month'])
        print(category_name + " Urls are generated")
        print("The crawler starts")

        for URL in day_urls:
            print(URL)
            regex = re.compile(r"date=(\d+)")
            news_date = regex.findall(URL)[0]

            request = self.get_url_data(URL)
            document = BeautifulSoup(request.content, 'html.parser')
            
            # html - newsflash_body - type06_headline, type06
            # Collect the articles on each page
            if (category_name == "연합뉴스속보"):
                post_temp = document.select('.newsflash_body .type02 li ')

            else:
                post_temp = document.select('.newsflash_body .type06_headline li dl')
                post_temp.extend(document.select('.newsflash_body .type06 li dl'))
           
            # Store the URLs of the articles on each page
            post = []
            headlines = []
            companys = []

            
            for line in post_temp:
                post.append(line.a.get('href'))  # add the URL of every article on this page to the post list
                try:
                    companys.append(line.find('span', class_="writing").text)
                except:
                    companys.append("err")
                try:
                    h = line.find_all('a')
                    if len(h) > 1:
                        headlines.append(h[1].text)
                    elif len(h) == 1:
                        headlines.append(h[0].text)
                    else:
                        headlines.append("err")
                except:
                    headlines.append("err")
            del post_temp
        
            
            print(len(post))

            for i in range(len(post)):  # article URLs
                # crawling delay between requests
                print(i)
                sleep(0.01)
                content_url = post[i]
                
                # Fetch the article HTML
                try:
                    article = Article(content_url, language='ko')
                    article.download()
                    article.parse()
                    text_sentence = article.text.strip()
                    text_company = companys[i]
                    text_headline = headlines[i].strip()
        ######################################################################
                    if self.keyword == 'initvalue':
                        wcsv = writer.get_writer_csv()
                        wcsv.writerow([news_date, category_name, text_company, text_headline, text_sentence, content_url])
                    else:
                        headline_to_words = text_headline.split()
                        if self.keyword in headline_to_words:
                            wcsv = writer.get_writer_csv()
                            wcsv.writerow([news_date, category_name, text_company, text_headline, text_sentence, content_url])
        ######################################################################

                            
                except Exception as err:
                    print(err)
        
        writer.close()
        return        
Exemple #55
0
def align_row_text():
    source_text = request.form['source_text']
    target_text = request.form['target_text']
    # check if source and target are urls
    url_rex = r"(?i)\b(?:(?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\((?:[^\s()<>]+|(?:\([^\s()<>]+\)))*\))+(?:\((?:[^\s()<>]+|(?:\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    if re.fullmatch(url_rex, source_text.strip().lower()):
        src_article = Article(source_text.strip())
        src_article.download()
        src_article.parse()
        source_text = src_article.title + "\n" + src_article.text
    if re.fullmatch(url_rex, target_text.strip().lower()):
        tar_article = Article(target_text.strip())
        tar_article.download()
        tar_article.parse()
        target_text = tar_article.title + "\n" + tar_article.text

    # segment source and target

    src_lang_code = lang_detect.detect(source_text)
    tar_lang_code = lang_detect.detect(target_text)

    if src_lang_code == 'zh-cn':
        srx_src_code = 'Generic'
    else:
        srx_src_code = src_lang_code

    if tar_lang_code == 'zh-cn':
        srx_tar_code = 'Generic'
    else:
        srx_tar_code = tar_lang_code

    srx_rules = srx_segmenter.parse(srx_file_path)
    seg_results = srx_segmenter.SrxSegmenter(srx_rules[srx_src_code],
                                             source_text)
    source_list = seg_results.extract()[0]
    seg_results = srx_segmenter.SrxSegmenter(srx_rules[srx_tar_code],
                                             target_text)
    target_list = seg_results.extract()[0]
    # translate target
    target_mt_list = mt_helpers.google_translate_chunk_by_chunk(
        target_list, tar_lang_code, src_lang_code)
    # align
    # initiate the alignment class
    algorithm = request.form.get('algorithm', 'fuzzy')
    align_options = {
        "location_weight":
        float(request.form.get('input_location_weight', 0.2)),
        "length_weight":
        float(request.form.get('input_length_weight', 0.1)),
        "meta_weight":
        float(request.form.get('input_length_weight', 0.1)),
        "semantic_weight":
        float(request.form.get('input_semantic_weight', 0.6)),
        "search_range":
        float(request.form.get('input_paragraph_size', 5)),
        "minimum_semantic_score":
        float(request.form.get('input_minimum_semantic_score', 0.5)),
        "minimum_partial_sem_match":
        0.1,
        "minimum_length_score":
        float(request.form.get('input_minimum_length_score', 0.6))
    }

    if algorithm == 'fuzzy':
        semantic_class = fuzzy_comp.FuzzyComp
    else:
        semantic_class = tfidf_scikit.TfidfComp
    alg = TranslationAligner()

    alg.align(semantic_class,
              source_list,
              target_list, [],
              target_mt_list,
              options=align_options)
    # save json file to a random file name under static files and return it with the results
    temp_file_name = ''.join(
        random.choices(string.ascii_uppercase + string.digits, k=10))
    temp_json_file_name = temp_file_name + ".json"
    alg.export_json_dict(os.path.join(export_path, temp_json_file_name))
    del alg
    return {"json_file_name": temp_json_file_name}
Exemple #56
0
def getData():
    url = request.args.get('url')
    # From Newspaper Framework getting required data
    content = Article(url)
    content.download()
    content.parse()
    title = content.title
    rawText = content.text
    # Unformatted data to show to the user
    textDisplay = rawText.split("\n\n")
    textDisplay = ' '.join(textDisplay)
    # Converting numbered text to digits
    t2d = text2digits.Text2Digits()
    numText = t2d.convert(rawText)
    text = numText.split("\n\n")
    text = ' '.join(text)
    # API data limit restriction: keep at most 5000 characters
    text = text[:5000]
    jsonData = {"text": text}
    configDataResource = os.path.join(SITE_ROOT, "data", "configdata.json")
    configData = json.load(open(configDataResource))

    # NER API call request
    headers = {
        'x-api-key': configData["X_API_KEY"],
        'Content-type': 'application/json'
    }
    ner_response = requests.post(
        configData["NAMED_ENTITY_RECOGNITION_ENDPOINT"],
        headers=headers,
        data=json.dumps(jsonData))
    # print(ner_response.text)
    # Deserializing the response
    places = lambda: None
    places.__dict__ = json.loads(ner_response.text)
    print(places.LOC)

    json_url = os.path.join(SITE_ROOT, "data", "sg-citi.json")
    data = json.load(open(json_url))

    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)
    LOC = []
    CASE = []
    for ent in doc.ents:
        print(ent.text, ent.start_char, ent.end_char, ent.label_)
        if ent.label_ == "CARDINAL":
            CASE.append(ent.text)
        if ent.label_ == "GPE":
            LOC.append(ent.text)

    count = []
    for i in CASE:
        if i.isdigit():
            if i not in count:
                count.append(i)
    print("COUNT: ", count)
    if not len(count):
        count = list(i for i in range(80, 500, 7))
    returnJson = {
        "text": textDisplay,
        "location": [],
        "category": ner_response.text
    }
    for i in places.LOC:
        for citi in data:
            if i in citi["name"] and citi["name"] not in returnJson["location"]:
                returnJson["location"].append({
                    "name":
                    citi["name"],
                    "lat":
                    "no1",
                    "lon":
                    "no2",
                    "count":
                    count[random.randrange(0, len(count))]
                })
                break
    print(returnJson)
    return jsonify(returnJson)
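Note: the lambda/__dict__ trick above is only there to get attribute-style access on the NER response. A plain dict avoids it; a minimal sketch, assuming the response JSON carries the LOC field used above:

import json

def extract_locations(ner_response_text):
    # Parse the NER service response and return its LOC entries (empty list if absent)
    payload = json.loads(ner_response_text)
    return payload.get("LOC", [])

The final loop would then iterate over extract_locations(ner_response.text) instead of places.LOC.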
Exemple #57
0
def get_article(url):
    article = Article(url, language='en')
    article.download()
    article.parse()
    return article
Exemple #58
0
    def parse_content(self, response):
        # This function does the detailed parsing of a news article

        ID = 'songtengteng'

        website_name = '商务部贸易救济调查局'

        # Website section
        website_block = response.xpath(
            "//div[@class='position']/a[2]/text()").extract_first()

        news_url = response.meta['url']

        # Author
        news_author_list = response.xpath('//script')
        if len(news_author_list) != 0:
            news_author = news_author_list.re(
                'v.{2}\ss.{4}e\s=\s\"[\u4e00-\u9fa5]+\"')[0][13:].replace(
                    '"', '')
        else:
            news_author = '商务部贸易救济调查局'

        # Publication time, unified format: YYYY MM DD HH:MM:SS           (pattern: v.{2}\stm\s=\s\".*\")
        publish_time = response.meta['publish_time']
        year = publish_time[0:4]
        month = publish_time[5:7]
        day = publish_time[8:10]
        juti_time = publish_time[-8:]
        publish_time = year + month + day + ' ' + juti_time

        # Tags supplied by the article itself
        news_tags = response.xpath('//script').re(
            'v.{2}\sc.+e\s=\s\"[\u4e00-\u9fa5]+\"')[0][14:].replace('"', '')

        # News title
        news_title = response.xpath('//h3/text()').extract_first()

        # News body text
        a = Article(response.url, language='zh')  # Chinese
        a.download()
        a.parse()
        news_content = a.text

        # Get the article's images and their file names
        image_urls = []
        image_names = []
        image_urls1 = response.xpath(
            '//p[@class="detailPic"]/img/@src|//div[@class="article_con"]/center/img/@src|//p[@style="text-align: center"]/img/@src'
        ).extract()
        if image_urls1 != []:
            image_urls = image_urls1
            for i in range(len(image_urls)):
                if i < 10 and i >= 0:
                    image_name = news_title + '_000' + str(i)
                    image_names.append(image_name)
                elif i < 100 and i >= 10:
                    image_name = news_title + '_00' + str(i)
                    image_names.append(image_name)
                elif i < 1000 and i >= 100:
                    image_name = news_title + '_0' + str(i)
                    image_names.append(image_name)
                else:
                    image_name = news_title + str(i)
                    image_names.append(image_name)

        yield self.getItem(
            id=ID,
            news_url=news_url,
            website_name=website_name,
            website_block=website_block,
            news_title=news_title,
            publish_time=publish_time,
            news_author=news_author,
            news_tags=news_tags,
            news_content=news_content,
            image_urls=image_urls,
            image_names=image_names,
        )
Exemple #59
0
def scrape_analyze(url):
    article = Article(url)
    article.download()
    article.parse()
    return article.text
Exemple #60
0
    def parse_artical(self, response):  # detailed article parsing
        ID = 'songtengteng'

        # News URL
        news_url = response.meta['url']

        # News title
        news_title = response.xpath('//h1/text()').extract_first()

        # Author
        a = response.xpath(
            '//div[@class="info-source"]/span/a/text()').extract_first()
        if a is None:
            news_author = ''
        else:
            news_author = a

        # Publication time
        publish_time = response.xpath(
            '//div[@class="info-source"]/span[2]/text()').extract_first()
        year = publish_time[0:4]
        month = publish_time[5:7]
        day = publish_time[8:10]
        juti_time = publish_time[-5:]
        publish_time = year + month + day + ' ' + juti_time + ':' + '00'

        # Body text
        '''Consider using a text-density algorithm to extract the article body more quickly'''
        a = Article(response.meta['url'], language='zh')  # Chinese
        a.download()
        a.parse()
        news_content = a.text

        # Tags
        news_tags = ''

        # Images
        image_urls1 = response.xpath('//p[@class="pi"]/img/@src').extract()
        image_urls = []
        image_names = []
        if image_urls1 != []:
            for i in range(len(image_urls1)):
                image_url = image_urls1[i]
                image_urls.append(image_url)
                if i >= 0 and i < 10:
                    image_title = news_title + '000' + str(i)
                elif i >= 10 and i < 100:
                    image_title = news_title + '00' + str(i)
                elif i >= 100 and i < 1000:
                    image_title = news_title + '0' + str(i)
                else:
                    image_title = news_title + str(i)
                image_names.append(image_title)

        yield self.getItem(id=ID,
                           news_url=news_url,
                           website_name='搜狐焦点',
                           website_block='访谈',
                           news_title=news_title,
                           publish_time=publish_time,
                           news_author=news_author,
                           news_tags=news_tags,
                           news_content=news_content,
                           image_urls=image_urls,
                           image_names=image_names)
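Note: both parse methods above build zero-padded image names with an if/elif ladder over the index. A format specifier gives the same result in one line; a minimal sketch (the underscore separator follows the first method, the second concatenates the title and index directly):

def make_image_names(news_title, count):
    # "{:04d}" zero-pads the index to four digits (0000, 0001, ..., 0999),
    # matching what the if/elif ladders produce for indexes below 1000
    return [f"{news_title}_{i:04d}" for i in range(count)]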