Example #1
def get_news():
    urls = get_urls()
    news = News.query.with_entities(News.source_url).all()

    used_urls = []
    for n in news:
        used_urls.append(n[0])

    for url in urls:
        if url not in used_urls:
            used_urls.append(url)

            article = Article(url, language='pt', keep_article_html=True)
            article.download()
            article.parse()
            article.nlp()

            news_article = News(url)
            news_article.slug = slugify(article.title)
            news_article.title = article.title
            news_article.text = article.text
            news_article.top_image = article.top_image
            news_article.summary = article.summary
            news_article.article_html = article.article_html
            news_article.created_at = datetime.datetime.now()

            exists_this_news = News.query.filter_by(source_url=url).first()

            if not exists_this_news:
                print(url)
                db.session.add(news_article)
                db.session.commit()
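Example #1 runs the costly download/parse/nlp calls before checking whether the URL is already stored. A minimal reordering sketch, assuming the same helpers and models from the example above (get_urls, News, db, slugify) and guarding the newspaper3k calls with ArticleException, as a later example also does:

import datetime
from newspaper import Article
from newspaper.article import ArticleException

def get_news_safe():
    # Sketch only: News, db, get_urls and slugify are assumed to come from Example #1's app.
    used_urls = {n[0] for n in News.query.with_entities(News.source_url).all()}
    for url in get_urls():
        if url in used_urls:
            continue
        used_urls.add(url)
        try:
            article = Article(url, language='pt', keep_article_html=True)
            article.download()
            article.parse()
            article.nlp()
        except ArticleException:
            continue  # skip URLs that fail to download or parse
        news_article = News(url)
        news_article.slug = slugify(article.title)
        news_article.title = article.title
        news_article.text = article.text
        news_article.top_image = article.top_image
        news_article.summary = article.summary
        news_article.article_html = article.article_html
        news_article.created_at = datetime.datetime.now()
        db.session.add(news_article)
        db.session.commit()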
Example #2
def extract():
  url = sys.argv[1:].pop()

  a = Article(url, keep_article_html=True)
  a.download()
  a.parse()
  a.nlp()

  parsed_uri = urlparse(a.source_url)
  domain = '{uri.netloc}'.format(uri=parsed_uri)

  try:
    publish_date = a.publish_date.strftime('%Y-%m-%d %H:%M')
  except AttributeError:
    publish_date = ""

  try:
    authors = ", ".join(a.authors)
  except AttributeError:
    authors = ""

  result = {}
  result['html'] = a.html
  result['body'] = a.text
  result['title'] = a.title
  result['top_image'] = a.top_image
  result['author'] = authors
  result['html_body'] = a.article_html
  result['favicon'] = a.meta_favicon
  result['description'] = a.summary
  result['publish_date'] = publish_date
  result['keywords'] = a.keywords
  result['sitename'] = re.sub(r"^www\.", "", domain)

  return json.dumps(result).encode('utf-8')
Example #3
def summarise_one(url, title=True, keywords=True, summary=False, \
    top_img_src=False):
    '''
    Fetch a URL and return (title, keywords, summary, top_img_src).
    '''
    # configuration for Newspaper to minimize processing time
    configure = Config()
    configure.fetch_images = False
    configure.MAX_SUMMARY = 300
    configure.MAX_SUMMARY_SENT = 3

    article = Article(url, config=configure)

    try:
        article.download()
        article.parse()
    except Exception:
        print(url)

    title = article.title
    if keywords or summary:
        try:
            article.nlp()
            if keywords:
                keywords = article.keywords
            if summary:
                summary = article.summary
        except Exception:
            print('Newspaper error with nlp() call')
        
    if top_img_src:
        top_img_src = article.top_image
   
    return title, keywords, summary, top_img_src
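A short usage sketch for summarise_one above, with a hypothetical URL, requesting only the title and keywords:

title, keywords, summary, top_img = summarise_one(
    "https://example.com/some-article",  # hypothetical URL
    keywords=True, summary=False, top_img_src=False)
print(title)
print(keywords)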
Example #4
def is_valid_article(link):
    print("Checking valid:\n" + link)

    if "cnn.com" not in link:
        return False
    if "html" not in link:
        return False
    article = Article(link)
    article.download()
    article.parse()
    article.nlp()
    keywords = article.keywords

    matched = False

    for key in keywords:
        if key in nc_set:
            matched = True
    for key in keywords:
        if key in contorversial_set:
            matched = False

    if matched and len(article.authors) > 0 and article.publish_date < datetime.datetime(2007, 12, 30, 0, 0):
        main_file.write(article.title + "\t\t" + ", ".join(article.keywords) + "\t\t" + link + "\t\t" + article.text + "\n")
        visited_articles.write(link+"\n")
        return True

    return False
Example #5
def show_article():
    url_to_clean = request.args.get('url_to_clean')
    if not url_to_clean:
        return redirect(url_for('index'))

    article = Article(url_to_clean)
    article.download()
    article.parse()

    try:
        html_string = ElementTree.tostring(article.clean_top_node)
    except Exception:
        html_string = "Error converting html to string."

    try:
        article.nlp()
    except Exception:
        log.error("Couldn't process with NLP")

    a = {
        'html': html_string,
        'authors': str(', '.join(article.authors)),
        'title': article.title,
        'text': article.text,
        'top_image': article.top_image,
        'videos': str(', '.join(article.movies)),
        'keywords': str(', '.join(article.keywords)),
        'summary': article.summary
    }
    return render_template('article/index.html', article=a, url=url_to_clean)
    
Example #6
def post_new(request):
    if request.method == "POST":
        form = PostForm(request.POST)
        if form.is_valid():
            post = form.save(commit=False)
            post.author = request.user
            post.published_date = timezone.now()
            post.save()
            return redirect('blog.views.post_detail', pk=post.pk)
    elif request.method == 'GET':
        url = request.GET.get('url', '')
               
        if len(url) > 5:
            article = Article(url, language='en')
            article.download()
            article.parse()
            article.nlp()
            image = article.top_image
            summary = article.summary.replace('\n', ' ').replace(u'\u2019',"\'")
            title = article.title.replace(u'\u2019',"\'")
            source = url.split('//')[1].split('/')[0].replace('www.','')
            status = 'UD'
            form = PostForm({'title': title, 'summary': summary, 'image': image, 'link':url, 'source':source, 'status':status,}) 
        else:
            form = PostForm() 

    return render(request, 'blog/post_edit.html', {'form': form})
Example #7
def get_nlp_data(url):
	article = Article(url)
	article.download()
	article.parse()
	article.nlp()
	
	return json.dumps(article.keywords)
def get_document_json(url):
    """
    Parameters
    -------------
    url: str
        url of the document to be parsed.
    Returns
    -------------
    dict: document data.
    """
    article = Article(url)
    article.download()
    article.parse()
    article.nlp()
    if article.publish_date is None or isinstance(article.publish_date, str):
        date = None
    else:
        date = article.publish_date.strftime('%Y-%m-%d')
    if article.meta_lang is not None and article.meta_lang != '':
        stopwords = safe_get_stop_words(article.meta_lang)
        keywords = [i for i in article.keywords if i not in stopwords]
    else:
        keywords = article.keywords
    keywords = list(set([slugify(i) for i in keywords]))
    json = {
        'title': article.title,
        'authors': article.authors,
        'created_on': date,
        'language': article.meta_lang,
        'keywords': keywords,
        'url': url,
    }
    return json
def extract_summary(article_url):
	#article_url = raw_input('Please enter the url of the newsarticle \n')
	article_obj = Article(article_url)
	article_obj.download() 
	article_obj.parse() 
	article_obj.nlp() 
	article_summary = article_obj.summary
	return article_summary
def extract_keywords(article_url):
	#article_url = raw_input('Please enter the url of the newsarticle \n')
	article_obj = Article(article_url)
	article_obj.download() 
	article_obj.parse() 
	article_obj.nlp() 
	article_keywords = article_obj.keywords
	return article_keywords
def main():
    source="The Guardian"
    #config = Config()
    #config.memoize_articles = False
    guardian = Source("http://www.theguardian.com/world", memoize_articles=False)
    guardian.build()
    #guardian = newspaper.build('http://theguardian.com/world', memoize_articles=False)
    #news_pool.set([guardian], threads_per_source=2)
    #news_pool.join()

    #print(guardian.size())

    for article in [x for x in guardian.articles if re.match(".*/world/.*", x.url) is not  None]:
        url = article.url
        a = Article(url, language='en')
        a.download()
        for i in range(10):
            if a.is_downloaded:
                break
            else:
                a.download()
        try:
            a.parse()
            a.nlp()
        except:
            print("Error: Not parsed/downloaded correctly.")
            continue

        html = a.html
        summary = a.summary
        keywords = a.keywords
        title = a.title
        text = a.text
        date = str(a.publish_date).split()[0].split("-")
        date[0], date[1], date[2] = date[1], date[2], date[0]
        date = "/".join(date)
        time_str = re.search(r'<span class="content__dateline-time">(.*)</span>', html).group(1).replace(".", ":").split()[0]
        date_time = date + " " + time_str
        #print(title)
        #print(date_time)
        date_obj = datetime.datetime.strptime(date_time, '%m/%d/%Y %H:%M')
        #print(date_obj.strftime('%Y/%m/%d %I:%M %p'))
        #TODO: Add stuff to the DB

        try:
            article = {
                'headline': title,
                'url': url,
                'text': text,
                'date': date_obj
            }
            newspaper_article('The Guardian', article, keywords=keywords)
        except Exception as ex:
            print('Article could not be created due to following error')
            print(ex)
def get_article_info(memento_url, dt, uri_id, base_dir):
    print(memento_url)
    article = Article(memento_url)
    html = get_uri_offline_data(dt, uri_id, "html", base_dir)
    article.download(html)
    article.parse()
    text = get_uri_offline_data(dt, uri_id, "txt", base_dir)
    if text is not None:
        article.text = text
    article.nlp()
    return article
Example #13
def parseURL(url):
    a = Article(url)
    try:
        a.download()
        a.parse()
        a.nlp()
        authors = a.authors
        keywords = a.keywords
        del(a)
        return (authors, keywords)
    except Exception:
        return (None, None)
Example #14
def fetch_article(url):
  print('fetch ' + url)

  a = Article(url=url, keep_article_html=True)
  a.download()

  try:
    a.parse()
  except Exception:
    exc = traceback.format_exc()
    print "Parse error: " + exc

  # newspaper gives us some news stuff
  text = a.article_html
  title = a.title
  image = a.top_image
  movies = a.movies
  authors = a.authors

  article_data = {
    "url": url,
    "title": title,
    "text": text
  }

  if authors:
    article_data["author"] = {
      "name": authors[0]
    }

  # media
  if movies:
    article_data["media"] = {
      "url": movies[0],
      "type": "video"
    }
  elif image:
    article_data["media"] = {
      "url": image,
      "type": "image"
    }

  try:
    a.nlp()
  except Exception:
    exc = traceback.format_exc()
    print "NLP error: " + exc

  if a.summary:
    article_data["summary"] = a.summary

  return article_data
Example #15
def quick_analyse(url):
    fields = ['authors', 'publish_date', 'top_image', 'movies', 'keywords', 'summary']

    article = Article(url)
    article.download()
    article.parse()
    article.nlp()

    print(len(article.html))
    for f in fields:
        print(f + ': ' + str(getattr(article, f)))

    return article
Example #16
def read_newspaper():
    url = request.args.get('url', '')
    if url:
        a = Article(url, image_dimension_ration=3, keep_article_html=True)
        a.download()
        a.parse()
        a.nlp()
        json_string = json.dumps(
            dict(top_image=a.top_image, text=a.article_html, title=a.title, summary=a.summary, images=a.images,
                 movies=a.movies),
            ensure_ascii=False,
            indent=None if request.is_xhr else 2)
        return Response(json_string, mimetype='application/json')
    return Response()
Example #17
def fetch_article(url):
    article = Article(url)
    article.download()
    article.parse()
    article.nlp()

    print(article.keywords)

    with app.app_context():
        entity = ArticleEntity.create(
            publish_date=article.publish_date,
            title=article.title,
            text=article.text,
        )
Example #18
def whoHasTimeToRead(url):
	is_article = valid_url(url, verbose=True)
	config = Config()
	config.MAX_KEYWORDS = 10
	if is_article:
		sumitup = {}
		b = Article(url=url,config=config)
		b.download()
		b.parse()
		b.nlp()
		sumNews = summary(b.title, b.text, b.keywords)
		sumTitle = b.title
		movies = b.movies[0] if len(b.movies) > 0 else "None"
		return sumNews,sumTitle,movies
	return "Nope"
def getrelevance(url, keywords):
    a = Article(url)
    a.download()
    a.parse()
    a.nlp()
    print(a.title)
    print(a.summary)
    l1 = a.keywords
    no = getsimilar(l1, keywords)
    print(keywords)
    print(l1)
    print("Similar words: ")
    print(no)
    print(len(keywords))
    print("Match: ")
    print(float(no) / len(keywords))
def train(urls, keywords):
    for url in urls:
        a = Article(url)
        a.download()
        a.parse()
        a.nlp()
        print(a.title)
        print(a.summary)
        print(a.keywords)
        l1 = a.keywords
        l1_1 = []
        for word in l1:
            l1_1.append(unicodedata.normalize('NFKD', word).encode('ascii', 'ignore'))
        diff = difference(l1_1, keywords)
        keywords.extend(diff)
    return keywords
Example #21
    def __init__(self, url):
        c = Config()
        c.keep_article_html = True

        article = Article(url=url, config=c)
        article.download()
        article.parse()
        
        try:
            article.nlp()
            summary = article.summary
            if summary == "":
                self.summary = "Summary not available!"
            else:
                self.summary = summary
        except Exception:
            self.summary = "Summary not available!"
    def _parse_article(self, key, url):
        a = Article('')
        html = Google().cache(url)
        a.set_html(html)
        a.parse()
        a.nlp()
        article = {"summary": a.summary,
                   "publish_date": a.publish_date,
                   "images": a.images,
                   "top_image": a.top_image,
                   "title": a.title,
                   "authors": a.authors,
                   "keywords": a.keywords,
                   "text": a.text}
        # update
        #conn = r.connect(db="clearspark")
        conn = r.connect(**rethink_conn.conn())
Example #23
def home(url):

    data = {}
    data['url'] = url

    # Validate url
    if urlparse.urlparse(url).scheme not in ('http', 'https'):
        data['error'] = 'Invalid URL'
        return json.dumps(data)

    a = Article(url)
    a.download()
    a.parse()

    data['title'] = a.title
    data['authors'] = a.authors
    data['text'] = a.text

    try:
        a.nlp()
    except UnicodeDecodeError:
        # Strip non-ascii characters
        a.title = to_ascii(a.title)
        a.text = to_ascii(a.text)
        a.nlp()

    # NLP
    data['summary'] = a.summary
    data['keywords'] = a.keywords
    data['tags'] = list(a.tags)

    # Media
    data['top_image'] = a.top_image
    data['images'] = a.images
    data['movies'] = a.movies

    # Meta
    data['source_url'] = a.source_url
    data['published_date'] = a.publish_date

    data['meta_img'] = a.meta_img
    data['meta_keywords'] = a.meta_keywords
    data['meta_lang'] = a.meta_lang

    return json.dumps(data)
Example #24
def scrap(index):
    base_url = "https://www.google.co.in/search?q=chennai%20accidents&tbm=nws&start=" + str(index)
    web_page = requests.get(base_url)
    parsed_content = PyQuery(web_page.text)
    all_crimes = parsed_content("a")
    for crime in all_crimes:
        crime_url = crime.attrib["href"]
        if "/url?q=" in crime_url:
            try:
                article = Article((crime_url.split(start))[1].split(end)[0])
                article.download()
                article.parse()
                article.nlp()
                keywords = article.keywords
                area_name = findLocation(keywords)
                final1.append(area_name)
            except Exception:
                pass
def parse_article(url):
    article = Article(url)
    article.download()
    try:
        article.parse()
        article.nlp()
    except ArticleException:
        # TODO: log the error
        return None
    else:
        return {
            'url': article.url,
            'title': article.title,
            'keywords': article.keywords,
            'summary': article.summary,
            'images': article.images,
            'movies': article.movies
        }
def fetch_data(bbc):
    bbc.build()
    for article in [x for x in bbc.articles]:
        url = article.url
        a = Article(url, language='en')
        a.download()
        for i in range(10):
            if a.is_downloaded:
                break
            else:
                a.download()
        try:
            a.parse()
            a.nlp()
        except Exception:
            print("Error: Not parsed/downloaded correctly.")
            continue

        html = a.html
        summary = a.summary
        keywords = a.keywords
        title = a.title
        print(title)
        text = a.text
        #date = str(a.publish_date).split()[0].split("-")
        #date[0], date[1], date[2] = date[1], date[2], date[0]
        #date = "/".join(date)
        #time = re.search(r'<span class="date date--v2 relative-time">(.*)<\/span>' , html).group(1).replace(".",":").split()[0]
        #bbc does not have a time div in html
        date_time = datetime.now()

        try:
            article = {
                'headline': title,
                'url': url,
                'text': text,
                'date': date_time
            }
            newspaper_article('BBC', article, keywords=keywords)
        except Exception as ex:
            print('Article could not be created due to following error')
            print(ex)
def Test(topic, url):
    print("Inside hello method")
    print(topic)
    # url = 'http://www.theguardian.com/technology/live/2015/mar/09/apple-watch-macbook-launch-event-smartwatch-spring-forward'
    # url = 'http:~~www.theguardian.com~technology~live~2015~mar~09~apple-watch-macbook-launch-event-smartwatch-spring-forward'
    url = url.replace('~', '/')
    article = Article(url)
    article.download()
    article.parse()
    text = article.text
    article.nlp()
    art_keywords = article.keywords
    art_summary = article.summary
    # print text
    print(art_summary)
    # print art_keywords
    get_tweets(topic, art_keywords)
    # return ''.join(art_keywords)
    return json.dumps(art_keywords)
Example #28
def get_article_array(url_array):
    arr = []
    for url in url_array:
        try:
            response = session.get(url, timeout=10)
            print("RESPONSE CODE: " + str(response.status_code))

            if response.ok:
                article = Article(url)
                article.download()
                article.parse()
                article.nlp()
                text = article.text
                arr.append(strip_unicode(text))
            else:
                print "error"
        except requests.HTTPError or requests.ConnectionError as e:
            print(e)
    return arr
Example #29
    def post(self, request, *args, **kwargs):
        url = request.POST.get("url")
        context = {}
        a = Article(url, language='en')
        a.download()
        a.parse()
        context["title"] = a.title
        context["text"] = a.text
        context["authors"] = ", ".join(a.authors)
        context["top_image"] = a.top_image
        a.fetch_images()
        context["images"] = a.images
        context["publish_date"] = a.publish_date
        context["movies"] = a.movies
        a.nlp()
        context["keywords"] = ", ".join(a.keywords)
        context["summary"] = a.summary
        context["url"] = url
        context["method"] = "post"
        return render(request, self.template_name, context)
Example #30
    def set_slyp(self):
        a = Article(self.raw_url)
        a.download()
        a.parse()
        a.nlp()

        self.url = a.url.split('?')[0] if 'youtube' not in a.site_name else a.url
        self.raw_url = a.url
        self.slyp_type = 'video' if a.is_video() else 'article'
        self.title = a.title
        self.author = a.author
        self.date = a.publish_date
        self.text = a.text
        self.summary = a.summary
        self.description = a.description
        self.top_image = a.top_image
        self.site_name = a.site_name
        self.has_video = a.has_video()
        self.video_url = a.video_url
        self.keywords = a.keywords
Example #31
def grab_news_from_RSS(inputpath, outpath, opn='write'):

    all_links = []
    jlines = []

    with open(inputpath, 'r') as filer:
        lines = filer.readlines()
        lines = [line.strip() for line in lines]

    # for line in lines:
    #     temp_list = get_links_from_rss_feed(line)
    #     if temp_list != []:
    #         all_links.append(temp_list)

    # print("Done Parsing XML_Files")

    with jsonlines.open(outpath, 'w') as op:
        for i_1, rss_link in enumerate(lines):
            print('processing RSS link ', i_1)
            list_al = get_links_from_rss_feed(rss_link)
            for i, link in enumerate(list_al):
                # print(link)
                try:
                    t1 = time.time()
                    article = Article(link)
                    article = timeout_setter(article, 100)
                    article.download()
                    article.parse()
                    article.nlp()
                    temp_dict = {}
                    temp_dict["authors"] = article.authors
                    if article.publish_date:
                        temp_dict[
                            "publish_date"] = article.publish_date.strftime(
                                "%m/%d/%Y, %H:%M:%S")
                    else:
                        temp_dict['publish_date'] = 'nil'
                    temp_dict["text"] = article.text
                    temp_dict["keywords"] = article.keywords
                    temp_dict["summary"] = article.summary
                    temp_dict['url'] = article.url
                    temp_dict['rss_link'] = rss_link
                    temp_dict['title'] = article.title
                    t2 = time.time()

                    if article.title != '':
                        if (detect(article.title) != 'en'):
                            continue  ## or whatever thing you wish to do in this case
                    if article.text != '':
                        if (detect(article.text) != 'en'):
                            continue  ## or whatever thing you wish to do in this case
                    if (i % 50 == 0):
                        print('time taken', str(t2 - t1))
                    if (opn == 'write'):
                        op.write(temp_dict)
                    else:
                        jlines.append(temp_dict)
                except Exception as E:
                    print(E)
                    print(traceback.format_exc())
                    pass
    if (opn != 'write'):
        return jlines
Example #32
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html5lib')
table = soup.findAll('a', attrs={'class': 'w_img'})
news = []
for row in table:
    if not row['href'].startswith('http'):
        news.append('https://timesofindia.indiatimes.com' + row['href'])

import nltk
nltk.download('punkt')
df = []
for i in news:
    article = Article(i, language="en")
    article.download()
    article.parse()
    article.nlp()
    data = {}
    data['Title'] = article.title
    data['Text'] = article.text
    data['Summary'] = article.summary
    data['Keywords'] = article.keywords
    df.append(data)

dataset = pd.DataFrame(df)
dataset.head()
FILEPATH = r"C:\Users\pavan\Downloads\crawl.csv"


def TrainTestSplit(X, Y, R=0, test_size=0.2):
    return train_test_split(X, Y, test_size=test_size, random_state=R)
Example #33
from newspaper import Article

#A news article from this website
url = "http://www.thehindu.com/opinion/lead/entering-the-age-of-gst/article19189469.ece"

#For different language newspaper refer above table
news_article = Article(url, language="en")  # en for English

#To download the article
news_article.download()

#To parse the article
news_article.parse()

#To perform natural language processing, i.e. nlp()
news_article.nlp()

#To extract title
print("Article's Title:")
print(news_article.title)
print("\n")

#To extract text
print("Article's Text:")
print(news_article.text)
print("\n")

#To extract summary
print("Article's Summary:")
print(news_article.summary)
print("\n")
Example #34
    def newspaper_parser(self, sleep_time=0):
        print('running newspaper_parser()...')
        results = []
        count = 0

        profile = webdriver.FirefoxProfile()
        browser = webdriver.Firefox(profile)
        credential_names = list(self.credentials.keys())

        browser.get(self.login_url)
        cred1 = browser.find_element_by_id(credential_names[0])
        cred2 = browser.find_element_by_id(credential_names[1])
        cred1.send_keys(self.credentials[credential_names[0]])
        cred2.send_keys(self.credentials[credential_names[1]])
        browser.find_element_by_id(self.submit_id).click()
        time.sleep(15)

        cookies = browser.get_cookies()
        browser.close()

        s = requests.Session()
        for cookie in cookies:
            s.cookies.set(cookie['name'], cookie['value'])

        for l in self.links:
            page = s.get(l)
            soup = BeautifulSoup(page.content)
            article = Article(url=l)
            article.set_html(str(soup))

            try:
                article.parse()
                article.nlp()
            except:
                time.sleep(60)
                continue

            data = {
                'title': article.title,
                'date_published': article.publish_date,
                'news_outlet': self.newspaper,
                'authors': article.authors,
                'feature_img': article.top_image,
                'article_link': article.canonical_link,
                'keywords': article.keywords,
                'movies': article.movies,
                'summary': article.summary,
                'text': article.text,
                'html': article.html
            }

            print(data['title'])
            print(data['text'])
            print()
            print()
            results.append(data)
            time.sleep(sleep_time)

            count += 1
            print(count)

        return results
Example #35
def select_scraper_zero_tasks():
    # database = "/home/zihua/macury/MercuryChallenge/scraper/articles.db"
    database = "/home/zihua/macury/MercuryChallenge/jsonToSql/mercury.db"
    #connect to the database
    con = sqlite3.connect(database)
    # counter = 0
    cur = con.cursor()

    #select event id and url from the sqlite table
    cur.execute(
        "select event_id, first_reported_link from cu_gsr_event group by first_reported_link having count(event_id)>0 order by count(first_reported_link) desc;"
    )

    #fetch the sql query with all search results
    rows = cur.fetchall()

    print(len(rows))

    # for each event id and url under the search query
    for event_id, url in rows:
        scraper = cfscrape.create_scraper(url)

        # print(event_id,', ', url)
        try:
            # scrape the url content
            gold = scraper.get(url).content
            print("URL scraped")
        except:
            # print the URL that was failed to visit
            print("Failed at ", url)
            continue

        #article function to parse the arabic webpage, especially in Arabic
        article = Article(url, memoize_articles=False, language='ar')

        #download html page by using scraper library
        try:
            article.download(input_html=gold)

            #This is another bug from the source code. Hope this solve the problem.
            #The reason behind this is some website has "deformed" format, and it takes a period of time to visit it.
            if article.download_state != 2:  # ArticleDownloadState.SUCCESS is 2
                time.sleep(1)
                article.parse()
            else:
                #parse the html page
                article.parse()

            #add sleep time here
            # time.sleep(1)

            #apply article nlp function
            article.nlp()

            #event id is given
            eventid = event_id

            #Troublemaker is Here! Article gives this value as a list,
            #but Sqlite Table does not take list as text value
            author = article.authors

            #I use join function to combine the list to be a string
            authors = " ".join(str(x) for x in author)

            #decide not to add current parsing time.
            #the publish date function gets publish date
            #depend upon the capability of this function to get publish date
            publish_date = article.publish_date
            # date = article.publish_date.date()

            #it gets title if possible
            title = article.title

            #it gets article main content
            content = article.text

            #Again, another troublemaker here! It works after combine list to be a string
            keyword = article.keywords
            if keyword is None:
                keywords = keyword
            else:
                keywords = ' '.join(str(e) for e in keyword)

            #Article summary, but assume it's same as text, which should be the main content
            summary = article.summary

            if title:
                print(title)
                # print(keywords)
            elif keywords:
                print("K Perceived")

            elif summary:
                print("S Perceived")

            elif content:
                print("C Perceived")

            else:
                print("Skipped, No Title or Any Other Needed Info")
                continue

            try:
                con2 = sqlite3.connect(database)
                # con = sqlite3.connect(database)
                with con2:
                    cur2 = con2.cursor()
                    #insert all attributes to the sqlite table
                    cur2.execute(
                        'INSERT INTO article_info (Event_ID, Authors, Publish_Date, Content, Keywords, Summary, Title) VALUES (?,?,?,?,?,?,?)',
                        (eventid, authors, publish_date, content, keywords,
                         summary, title))
                    # call commit on the connection...
                    con2.commit()

            except Error as e:
                print(event_id + " not successful. Error: " + database)
                pass

        except:
            print("This File May Not Been Downloaded!")
            pass
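The comment in the example above about "deformed" pages hints at polling the article's download state before parsing. A minimal sketch of such a wait loop, built on the same ArticleDownloadState check used in a later example (download_with_wait is a hypothetical helper name):

from time import sleep
from newspaper import Article
from newspaper.article import ArticleDownloadState, ArticleException

def download_with_wait(url, max_wait=10):
    # Hypothetical helper: poll download_state instead of sleeping a fixed second.
    article = Article(url)
    article.download()
    slept = 0
    while article.download_state == ArticleDownloadState.NOT_STARTED:
        if slept > max_wait:
            raise ArticleException('Download never started')
        sleep(1)
        slept += 1
    article.parse()
    return article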
links = []
for i in articles:
    if i.find('a')['href'] == '#':
        continue
    else:
        links.append(i.find('a')['href'])

all_row_list = []
i = 1
for link in links[:25]:
    print("Scraping: " + link)

    news = Article(link)
    news.download()
    try:
        news.parse()
        news.nlp()
        kategori = link.split('/')[3]

        row_list = [
            i, 'detik_inet', news.publish_date, news.title,
            news.text.replace("\n", ""), kategori
        ]
        i += 1
        all_row_list.append(row_list)

    except Exception:
        pass

writeToCsv(all_row_list)
def home_page():

    # Scrape and parse textual content from web resource. This method employs Article from Newspaper3k library to download and parse html from the web resource. It uses heuristics to scrape main body of visible text.
    # :param url: Uniform Resource Locator.
    # :return: Scraped content of web resource.

    user_input = st.text_input('Enter URL of an article or text')

    with open(get_data_path('fake_news_sites.json')) as json_file:
        fake_news_db_news = json.load(json_file)

    with open(get_data_path('categories.json')) as json_file:
        categories = json.load(json_file)

    with open(get_data_path('opensources/sources.json')) as json_file:
        open_source_json = json.load(json_file)

    try:
        # Get domain name from the url
        domain_name = get_domain(user_input)

        # Get formated domain
        formated_domain = format_url(domain_name)

    except Exception:
        st.warning("Enter an URL to suppress the warning !!")

    try:
        my_article = Article(user_input, language="en", keep_article_html=True)
        my_article.download()
        slept = 0
        while my_article.download_state == ArticleDownloadState.NOT_STARTED:
            # Raise exception if article download state does not change after 10 seconds
            if slept > 9:
                raise ArticleException('Download never started')
            sleep(1)
            slept += 1
        my_article.parse()
    except Exception as ec:
        print(ec)

    if st.button('Check authenticity'):
        st.header("VirusTotal - Malicious URL Scanner (virustotal.com)")
        st.markdown('''---''')
        with st.spinner(text="Fetching measures - Analysis in progress"):
            # task = asyncio.create_task(scan_url(user_input))
            # json_data = await task
            json_data = scan_url(user_input=user_input)
            if json_data is not None:
                category_key = list(json_data.keys())
                category_value = [json_data[i]['result'] for i in category_key]
                left, center, right = st.beta_columns((1, 2, 1))

                with left:
                    left.markdown('''**No.** ''', unsafe_allow_html=True)
                    for i in range(1, 21):
                        left.write(i)
                with center:
                    center.markdown('''**Detected by**''',
                                    unsafe_allow_html=True)
                    for i in category_key[:20]:
                        center.write(i)
                with right:
                    right.markdown('''**Result**''', unsafe_allow_html=True)
                    for link in category_value[:20]:
                        if link == 'clean':
                            right.markdown(
                                f'<span style="color:green">clean site</span>',
                                unsafe_allow_html=True)
                        else:
                            right.markdown(
                                f'<span style="color:red">{link}</span>',
                                unsafe_allow_html=True)
            else:
                st.warning(
                    "Couldn't able to get detect the site or Invalid URL provided !!"
                )

        st.header("News site authencity")
        st.markdown('''---''')

        left, right = st.beta_columns((1, 2))
        res = get_opensource_news(domain_name, formated_domain,
                                  open_source_json)
        left.markdown('''**Source** : OpenSource http://www.opensources.co/''',
                      unsafe_allow_html=True)
        right.markdown(f'**Checking Domain** : {domain_name}',
                       unsafe_allow_html=True)
        if res is None:
            right.warning("URL is not found in OpenSource Database")
        else:
            right.markdown(f'**Category** : {res["type"]}',
                           unsafe_allow_html=True)
            try:
                right.markdown(f'**Description** : {categories[res["type"]]}',
                               unsafe_allow_html=True)
            except Exception:
                right.warning("Category description isn't available !!")
            if res["Source Notes (things to know?)"]:
                right.markdown(
                    f'**Source Notes (things to know?)** : {res["Source Notes (things to know?)"]}',
                    unsafe_allow_html=True)

        st.markdown('''---''')
        left1, right1 = st.beta_columns((1, 2))
        res1 = get_fb_news_data(domain_name, formated_domain,
                                fake_news_db_news)

        left1.markdown('''**Source** : FakeNews Site DB''',
                       unsafe_allow_html=True)
        right1.markdown(f'**Checking Domain** : {domain_name}',
                        unsafe_allow_html=True)
        if res1 is None:
            right1.warning("URL is not found in Fake news site database")
        else:
            try:
                right1.markdown(f'**Category** : {res1["siteCategory"]}',
                                unsafe_allow_html=True)
                right1.markdown(f'**Site name** : {res1["siteTitle"]}',
                                unsafe_allow_html=True)
                if type(res1["siteCategory"]) is list:
                    right1.markdown(
                        f'**Discription** : {categories[res1["siteCategory"][0]]}',
                        unsafe_allow_html=True)
                else:
                    right1.markdown(
                        f'**Discription** : {categories[res1["siteCategory"]]}',
                        unsafe_allow_html=True)

                if res1["siteNotes"]:
                    right1.markdown(
                        f'**Source Notes (things to know?)** : {res1["siteNotes"]}',
                        unsafe_allow_html=True)
            except Exception:
                st.warning("Category is not available for this site !!")

            if res1["siteCategory"] == 'reliable':
                st.success(
                    "This is a trusted news site, which means the claim and article published on this site is transparent, authentic, trustworthy, complete, and in the absence of biases, it also protects audiences and users from disinformation."
                )
            else:
                st.error(
                    "This news site is not reliable or not authentic, the information published by this site might not be true !!"
                )

        st.markdown('''### **Article Title**''')
        # st.header(Article Title)
        title = my_article.title
        if title:
            st.markdown(f'{title}')
        else:
            st.warning(
                "Coudn\'t able extract the title or Invalid URL Provided")

        st.markdown('''### **Article Authors **''')
        author = my_article.authors
        if len(author) != 0:
            # st.markdown(f'{author}')
            st.markdown(
                f'<span style="background-color:#00C4EB;border-radius:5px;box-shadow: 0 5px 0 rgb(0, 116, 191);color: #FFFFFF;padding: 0.5em 1em;position: relative;text-decoration: none;font-weight:bold;cursor: pointer;">{author[0]}</span>',
                unsafe_allow_html=True)
        else:
            st.warning(
                "Coudn\'t able extract the author name or Invalid URL Provided"
            )

        st.markdown('''### **Publish Date**''')
        date = my_article.publish_date
        if date:
            st.info(f'{date} ')
        else:
            st.warning(
                "Coudn\'t able extract the publish date or Invalid URL Provided"
            )

        st.markdown('''### **Image**''')
        image_url = my_article.top_image
        if image_url:
            st.image(image_url, caption="Article Top Image")
            st.markdown(
                f'''<p align="center"><b> Source URL : <b><a href="{ image_url }">{ image_url }</a></p>''',
                unsafe_allow_html=True)
        else:
            st.warning(
                "Coudn\'t able extract the Image or Invalid URL Provided or No image is present"
            )

        st.markdown('''### **Article Text**''')
        article_text = my_article.text
        if article_text:
            with st.beta_expander(
                    "🧙 Click here for more info about the article 🔮"):
                st.markdown(f'{article_text}', unsafe_allow_html=True)
        else:
            st.warning(
                "Coudn\'t able extract the publish article or Invalid URL Provided"
            )

        st.markdown('''### **Movies / Videos**''')
        videos = my_article.movies
        if videos:
            st.video(videos[0])
        else:
            st.warning(
                "Coudn\'t able extract the publish videos or No videos were published or Invalid URL Provided "
            )

        try:
            my_article.nlp()
        except Exception as ec:
            st.error(ec)
        # except ArticleException:
        #     st.error("Article Exception Occured !!")

        st.markdown('''### **Keywords (NLP)**''')
        nlp_keywords = my_article.keywords
        if nlp_keywords:
            st.info(nlp_keywords)
        else:
            st.warning(
                "Coudn\'t able to get the top keywords or Invalid URL Provided"
            )

        st.markdown('''### **Summary (NLP)**''')
        nlp_summary = my_article.summary
        if nlp_summary:
            st.markdown(f'{nlp_summary}', unsafe_allow_html=True)
        else:
            st.warning(
                "Coudn\'t able to get the summary of the article or Invalid URL Provided"
            )

        st.header("News article veracity")
        st.markdown('''---''')

        if article_text is not None:

            with st.spinner(text="Inference is in Progress ⏳ ..."):
                output_label = asyncio.run(
                    model_service.predict_from_server(article_text))
                # left,right = st.beta_columns((1,2))
                st.markdown(
                    '''**Analysis based on:** : Artificial intelligence''')
                st.markdown(
                    '''**Notes:** WARNING: This result may be inaccurate! This domain wasn't categorised on any human maintained list thus analysis was performed by machine learning model.'''
                )
                if output_label:
                    st.markdown(f'Predicted label : {output_label}',
                                unsafe_allow_html=True)
                    st.success("Real news")
                else:
                    st.markdown(f'Predicted label : {output_label}',
                                unsafe_allow_html=True)
                    st.error("Fake news")
            st.balloons()
        else:
            st.warning(
                "Article text is not found, hence news article veracity analysis is incomplete !!"
            )
article.download()

#To parse the article
article.parse()

#To extract title
print("Article's Title:")
print(article.title)
print("\n")

#To extract text
print("Article's Text:")
print(article.text)
print("\n")

article.nlp()

#To extract summary
print("Article's Summary:")
print(article.summary)
print("\n")

#To extract keywords
print("Article's Keywords:")
print(article.keywords)

#Chinese newspaper
from newspaper import Article

#News article from the sinchew (Chinese Newspaper)
url = 'https://www.sinchew.com.my/content/content_2058387.html'
import nltk
from newspaper import Article
import sys
import os.path
from os import path

#Get the article
url = str(sys.argv[1])
mFilePath = str(sys.argv[2])
article = Article(url)

# Do some NLP
article.download()  #Downloads the link’s HTML content
article.parse()  #Parse the article
nltk.download('punkt')  #1 time download of the sentence tokenizer
article.nlp()  #  Keyword extraction wrapper

text = article.summary
print(text)

obj = TextBlob(text)
#returns the sentiment of text
#by returning a value between -1.0 and 1.0
sentiment = obj.sentiment.polarity

if path.exists(mFilePath):
    Html_file = open(mFilePath, "a", newline='')
    Html_file.write('\n')
    Html_file.write(str(sentiment))
    Html_file.close()
else:
Example #40
    date, time = time.split('T')
    time = time[:-1]
    time = f"{date} {time} +0000"
    time = datetime.strptime(time, '%Y-%m-%d %H:%M:%S %z')
    time = time.astimezone(pytz.timezone('Asia/Kolkata'))

    # Create link
    link = item.a['href']
    link = ''.join(["https://news.google.com", link[1:]])

    # Use Article from newspaper
    try:
        art = Article(link, language="en")
        art.download()
        art.parse()
        art.nlp()

        # Create main data
        data.append(f"Article's Date :- {time.date()} {time.time()}")
        data.append(f"Article's Title :- {title.text}")
        data.append(f"Link for article :- {link}")
        data.append(f"Summary :- \n{art.summary}")

        # Write to main news csv
        main_writer.writerow(
            [f"{time.date()} {time.time()}", title.text, art.summary, link])
        main_data.append('\n\n'.join(data))

    except:
        print(
            f"\nLink to \"{title.text}\" does not work or there is a connection error")
Example #41
    penalties[data] = (
        abs(url_data[data][4] - ultimateMeanVibe)
    ) * meanVibeWeight  # data piece will be anywhere from 0 to 18 penalty, where it normally does not exceed 9
    date_time_written = datetime.datetime.strptime(dates[urls.index(data)],
                                                   '%Y-%m-%d %H:%M:%S')
    deltadatetime = date_time_now - date_time_written
    penalties[data] += (deltadatetime.days / 14) * numDaysWeight
    penalties[data] += max(
        -.5, -((url_data[data][2] / url_data[data][0]) * quoteWeight))

    keywordsInTitle = 0
    titleWords = titles[urls.index(data)].split()
    a = Article(data)
    a.download()
    a.parse()
    a.nlp()
    for keyword in a.keywords:
        for word in titleWords:
            if (keyword == word):
                keywordsInTitle += 1
    penalties[data] += -(keywordsInTitle * titleWordWeight)

newDic = {k: v for k, v in sorted(penalties.items(), key=lambda item: item[1])}
newerDic = {}

newerDicVals = 0
for key in newDic:
    if (newerDicVals > 49):
        break
    newerDic[key] = newDic[key]
    newerDicVals += 1
Example #42
def scrape(query, month=""):

    last_day = 31
    time = ""

    if (month[-2:] == '02'):
        last_day = 28
    elif (month[-2:] in ['04', '06', '09', '11']):
        last_day = 30

    if (month != ""):
        time = "after:" + month + "-01" + " AND before:" + month + "-" + str(
            last_day)  #last_day > 10 so no date formatting issues
    news = "https://news.google.com"
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
    }
    page = requests.get("https://news.google.com/search?q=" + query + " " +
                        time,
                        headers=headers)
    soup = BeautifulSoup(page.content, features="lxml")
    found = soup.findAll("a")
    links = [x.get('href') for x in found]
    temp = []
    redirects = []
    articles = []
    count = 0
    for l in links:
        if (type(l) == str):
            if 'article' in l:
                if l not in redirects:
                    redirects.append(l)
                    count += 1

    redirects = [x[1:] for x in redirects]

    for a in redirects:
        try:
            r = requests.get(news + a, timeout=10)
            articles.append(r.url)
        except Exception as e:
            print(e)

    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
    config = Config()
    config.browser_user_agent = user_agent

    a = None
    data = []
    for url in articles:
        try:
            a = Article(url, config=config)
            a.download()
            a.parse()
            a.nlp()

            d = {
                "url": url,
                "title": a.title,
                "authors": a.authors,
                "date": a.publish_date.date(),
                "keywords": a.keywords,
                "summary": a.summary,
                #"text": a.text
            }
            data.append(d)

            #file.write(url + ";" + a.title + ";" + a.authors + ";" + a.publish_date.date() + ";" + a.summary.encode('unicode-escape') + ";" + a.text.encode('unicode-escape'))

            #print(a.title, a.authors, a.publish_date.date(), a.keywords)
            #print(a.summary)
            #print('*'*100)
        except Exception as e:
            print("Error with", url)
            print(a.title)
            print(e)
            print('*' * 100)

    return data
Example #43
import csv

#A new article from TOI
url = "https://www.programiz.com/python-programming/working-csv-files"

#For different language newspaper refer above table
toi_article = Article(url, language="en")  # en for English

#To download the article
toi_article.download()

#To parse the article
toi_article.parse()

#To perform natural language processing ie..nlp
toi_article.nlp()

#To extract title
print("Article's Title:")
p = toi_article.text
print(p)
print("nn")

#To extract text
print("Article's Text:")
c = toi_article.text
print(c)
print("nn")

#To extract summary
print("Article's Summary:")
Example #44
def thevergeArticles(i, queue, debug, minDelay, maxDelay, b):
    while True:
        try:
            url = queue.get()
        except:
            time.sleep(1)
            continue
        if url is None:
            break
            sys.exit(1)

        if debug:
            sys.stdout.write('Visting publication url :: ' + url + '\n' + '\n')

        logging.info('Visting publication url :: ' + url + '\n' + '\n')

        article = Article(url)
        article.download()
        article.parse()
        try:
            authors = article.authors
        except:
            authors = ''
        try:
            publish_date = str(
                article.publish_date.replace(second=0).isoformat().replace(
                    ':00+00:00', '+00:00'))
            publish_date = article.publish_date
        except:
            publish_date = ''
        try:
            date = str(
                datetime(publish_date.year,
                         publish_date.month,
                         publish_date.day,
                         tzinfo=TZ()).isoformat()).replace(
                             ':00+00:00', '+00:00')
        except Exception as e:
            print(str(e))
            date = ''
        print(date)
        try:
            text = article.text
        except:
            text = ''
        try:
            top_image = article.top_image
        except:
            top_image = ''
        try:
            movies = article.movies
        except:
            movies = ''
        article.nlp()
        try:
            keywords = article.keywords
        except:
            keywords = ''
        try:
            summary = article.summary
        except:
            summary = ''
        if summary == '':
            summary = trimArticle(text, 50)
        images = {}
        try:
            all_images = list(article.images)
            if len(all_images) > 0:
                for i in range(len(all_images)):
                    images['image_' + str(i)] = all_images[i]

        except:
            pass
        try:
            abstract = article.meta_description
        except:
            abstract = ''
        try:
            title = article.title
        except:
            title = ''

        # write the fellow summary to file
        file_name = 'theverge_' + title.replace(' ', '-') + '.json'
        file_name = ''.join(c for c in file_name if c in valid_chars)

        if os.name == 'nt':
            f = open('success//' + file_name, 'w')
        else:
            f = open('success/' + file_name, 'w')
        folder = 'success'
        logging.info('Opened ' + 'success//' + file_name + '.json' +
                     ' for writing')

        data = {
            'abstract': summary,
            'external_id': 'theverge_' + title.replace(' ', '-'),
            'date': date,
            'url': url,
            'title': title,
            'words': text,
            'meta': {
                'theverge': {
                    'keywords': str(keywords),
                    'top_image': top_image,
                    'authors': authors,
                    'allImages': str(images)
                }
            }
        }

        f.write(json.dumps(data))
        f.close()
        logging.info('File written ' + file_name)
        if os.name == 'nt':
            uploadDataS3(folder + '//' + file_name, b)
        else:
            uploadDataS3(folder + '/' + file_name, b)
        if debug:
            sys.stdout.write(file_name + ' has been written to S3 bucket' +
                             '\n')
        logging.info(file_name + ' has been written to S3 bucket' + '\n')

        if debug:
            sys.stdout.write(file_name + ' written' + '\n')
        wait_time = random.randint(minDelay, maxDelay)
        sys.stdout.write('Sleeping for :: ' + str(wait_time) + '\n')
        logging.info('Sleeping for :: ' + str(wait_time) + '\n')
        sys.stdout.write('******************************************' + '\n')
        sys.stdout.write('******************************************' + '\n')
        time.sleep(wait_time)
Example #45
def scrape_reddit(reddit, engine, limit_, yest):
    try:
        i = 0
        for submission in reddit.subreddit('news').hot(limit=limit_):
            if (submission.created > yest):
                query_comments = '''SELECT EXISTS(SELECT * FROM MemeNews.every_comment  WHERE post_id LIKE '{0}' LIMIT 1)'''.format(
                    submission.id)
                query_articles = '''SELECT EXISTS(SELECT * FROM MemeNews.Daily_Articles  WHERE id LIKE '{0}' LIMIT 1)'''.format(
                    submission.id)
                if (engine.execute(query_articles).fetchone()[0]):
                    continue
                submission.comment_sort = 'best'
                article = Article(submission.url)
                try:
                    article.download()
                    article.parse()
                    article.nlp()
                    article.fetch_images()
                except:
                    continue
                articles_dict = {
                    "title":
                    re.sub(r'[^\x00-\x7F]', '',
                           submission.title.replace('"', "'")),
                    "score":
                    submission.score,
                    "id":
                    submission.id,
                    "url":
                    submission.url,
                    "comms_num":
                    submission.num_comments,
                    "created":
                    submission.created,
                    "body":
                    re.sub(r'[^\x00-\x7F]', '', article.text.replace('"',
                                                                     "'")),
                    "image":
                    article.top_image,
                    "keywords":
                    ', '.join(article.keywords).replace('"', "'"),
                    "summary":
                    re.sub(r'[^\x00-\x7F]', '',
                           article.summary.replace('"', "'"))
                }
                #add articles
                articles_data = pd.DataFrame(articles_dict, index=[i])
                articles_data.to_sql('Daily_Articles',
                                     con=engine,
                                     if_exists='append',
                                     dtype={'None': VARCHAR(5)})
                print("article added with url: ", submission.url)
                if (engine.execute(query_comments).fetchone()[0]):
                    continue
                comment_dict = {
                    "post_id": [],
                    'post_title': [],
                    "id": [],
                    "author": [],
                    "body": [],
                    "created": [],
                    'score': [],
                    'is_submitter': [],
                    'parent_id': []
                }
                for top_level_comment in submission.comments.list()[:100]:
                    try:
                        comment_dict['is_submitter'].append(
                            top_level_comment.is_submitter)
                        comment_dict['post_id'].append(submission.id)
                        comment_dict['id'].append(top_level_comment.id)
                        comment_dict['author'].append(top_level_comment.author)
                        comment_dict['body'].append(
                            re.sub(r'[^\x00-\x7F]', '',
                                   top_level_comment.body))
                        comment_dict['score'].append(top_level_comment.score)
                        comment_dict['created'].append(
                            top_level_comment.created_utc)
                        comment_dict['parent_id'].append(
                            top_level_comment.parent_id)
                        comment_dict['post_title'].append(submission.title)
                    except:
                        continue
                comment_data = pd.DataFrame(comment_dict)
                comment_data.to_sql('every_comment',
                                    con=engine,
                                    if_exists='append',
                                    dtype={'None': VARCHAR(5)})
                print("comments added")
                i += 1
        return 1
    except Exception as err:
        print(err)
        return 0
    def handle(self, url, website, *args, **options):
        config = Config()
        config.browser_user_agent = user_agent

        if website:
            links = pagelinks.objects.filter(
                Q(fetched=False) & Q(site__name=website[0])).values_list(
                    'url', flat=True)
        elif url:
            links = pagelinks.objects.filter(Q(url=url[0])).values_list(
                'url', flat=True)
        else:
            links = pagelinks.objects.filter(fetched=False).values_list(
                'url', flat=True)

        l = len(links)
        j = 0

        print(str(l) + ' link(s)')
        printProgressBar(0,
                         l,
                         prefix='Progress: ',
                         suffix='Complete',
                         length=100)

        for link in links:
            print(str(link) + '\n')
            print(str(l - j) + ' remaining..')

            try:
                an_article = Article(url=link, config=config)
                an_article.download()
                an_article.parse()
                an_article.nlp()

                if len(an_article.text) > 999:
                    pagelinks.objects.filter(url=link).update(
                        fetched=True,
                        body=an_article.text,
                        publish_date=an_article.publish_date,
                        top_image=an_article.top_image,
                        authors=an_article.authors,
                        videos=an_article.movies,
                        keywords=an_article.keywords,
                        summary=an_article.summary,
                        title=an_article.title,
                        is_article=True)
                else:
                    pagelinks.objects.filter(url=link).update(
                        fetched=True,
                        body=an_article.text,
                        publish_date=an_article.publish_date,
                        top_image=an_article.top_image,
                        authors=an_article.authors,
                        videos=an_article.movies,
                        keywords=an_article.keywords,
                        summary=an_article.summary,
                        title=an_article.title)

                j += 1
                printProgressBar(j,
                                 l,
                                 prefix='Progress: ',
                                 suffix='Complete',
                                 length=100)

            except Exception as e:
                print(e)
                continue
Exemple #47
0
def build_db():
    fn = os.path.join(os.path.dirname(__file__), '../panel/panel.json')
    db = {}
    db['snippets'] = []

    with open(fn, "r") as json_file:
        panel = json.loads(json_file.read())

        with open("database.json", "w") as outfile:
            i = 0
            for key in panel:
                i += 1
                j = 0
                print('looking at {}, {}/{} keywords'.format(key, i, len(panel)))

                for pundit in panel[key]:
                    j += 1

                    print('\tlooking at {}, {}/{} pundits'.format(
                        pundit['name'].encode('ascii', 'ignore').decode(), j,
                        len(panel[key])))

                    if pundit['links']['brookings']:
                        response = requests.get(pundit['links']['brookings'])
                        soup = bs4.BeautifulSoup(response.text, "html.parser")
                        for link in soup.select('ul.media-list li div.content h3.title a'):
                            snippet = {}
                            link_href = str(link.attrs.get('href'))
                            url = "http://www.brookings.edu" + link_href

                            try:
                                link_response = requests.get(url)
                                soup = bs4.BeautifulSoup(link_response.text, "html.parser")

                                try:
                                    full_url_link = soup.select('div.article-detail em a')[0]
                                    full_url = str(full_url_link.attrs.get('href'))
                                except IndexError:
                                    full_url = url

                                if 'pdf' not in full_url and validate_url(full_url):
                                    article = Article(full_url)

                                    try:
                                        article.download()
                                        article.parse()
                                        article.nlp()

                                        print('\t\t', full_url)
                                        snippet["text"] = article.text
                                        snippet["summary"] = article.summary
                                        snippet["url"] = url
                                        snippet["full_url"] = full_url
                                        snippet["keywords"] = article.keywords

                                        snippet["pundit"] = {}
                                        snippet["pundit"]["name"] = pundit["name"]
                                        snippet["pundit"]["title"] = pundit["title"]
                                        db['snippets'].append(snippet)
                                    except ArticleException:
                                        pass
                            except requests.exceptions.ConnectionError:
                                pass

                    if pundit['links']['cfr']:
                        response = requests.get(pundit['links']['cfr'] + "#publications")
                        soup = bs4.BeautifulSoup(response.text, "html.parser")

                        for link in soup.select('div#publications article.publication_spotlight h3 a'):
                            snippet = {}
                            link_href = str(link.attrs.get('href'))

                            try:
                                if "http" in link_href:
                                    url = link_href
                                    full_url = link_href
                                else:
                                    url = "http://www.cfr.org" + link_href
                                    full_url_link = soup.find(text='View full text of article')

                                    if full_url_link:
                                        full_url = str(full_url_link.parent.attrs.get('href'))
                                    else:
                                        full_url = url

                                    link_response = requests.get(url)
                                    soup = bs4.BeautifulSoup(link_response.text, "html.parser")

                                if 'pdf' not in full_url and validate_url(full_url):
                                    article = Article(full_url)
                                else:
                                    article = Article(url)

                                try:
                                    article.download()
                                    article.parse()
                                    article.nlp()

                                    print('\t\t', url)
                                    snippet["text"] = article.text
                                    snippet["summary"] = article.summary
                                    snippet["url"] = url
                                    snippet["full_url"] = full_url
                                    snippet["keywords"] = article.keywords

                                    snippet["pundit"] = {}
                                    snippet["pundit"]["name"] = pundit["name"]
                                    snippet["pundit"]["title"] = pundit["title"]
                                    db['snippets'].append(snippet)
                                except ArticleException:
                                    pass
                            except requests.exceptions.ConnectionError:
                                pass

                    if pundit['links']['baker']:
                        response = requests.get(pundit['links']['baker'])
                        soup = bs4.BeautifulSoup(response.text, "html.parser")

                        for link in soup.select('h3#library ul li a'):
                            snippet = {}
                            link_href = str(link.attrs.get('href'))

                            try:
                                url = "http://www.bakerinstitute.org" + link_href
                                full_url_link = soup.select('div.research_content div.researchContent span a')[-1]

                                if full_url_link:
                                    full_url = str(full_url_link.parent.attrs.get('href'))
                                else:
                                    full_url = url

                                link_response = requests.get(url)
                                soup = bs4.BeautifulSoup(link_response.text, "html.parser")

                                if 'pdf' not in full_url and validate_url(full_url):
                                    article = Article(full_url)
                                else:
                                    article = Article(url)

                                try:
                                    article.download()
                                    article.parse()
                                    article.nlp()

                                    print('\t\t', url)
                                    snippet["text"] = article.text
                                    snippet["summary"] = article.summary
                                    snippet["url"] = url
                                    snippet["full_url"] = full_url
                                    snippet["keywords"] = article.keywords

                                    snippet["pundit"] = {}
                                    snippet["pundit"]["name"] = pundit["name"]
                                    snippet["pundit"]["title"] = pundit["title"]
                                    db['snippets'].append(snippet)
                                except ArticleException:
                                    pass
                            except requests.exceptions.ConnectionError:
                                pass

                    if pundit['links']['ecfr']:
                        response = requests.get(pundit['links']['ecfr'])
                        soup = bs4.BeautifulSoup(response.text, "html.parser")

                        for link in soup.select('ul#all li div.post div.list-content a'):
                            snippet = {}
                            link_href = str(link.attrs.get('href'))

                            try:
                                url = "http://www.ecfr.eu" + link_href
                                full_url = url

                                link_response = requests.get(url)
                                soup = bs4.BeautifulSoup(link_response.text, "html.parser")

                                if 'pdf' not in full_url and validate_url(full_url):
                                    article = Article(full_url)
                                else:
                                    article = Article(url)

                                try:
                                    article.download()
                                    article.parse()
                                    article.nlp()

                                    print('\t\t', url)
                                    snippet["text"] = article.text
                                    snippet["summary"] = article.summary
                                    snippet["url"] = url
                                    snippet["full_url"] = full_url
                                    snippet["keywords"] = article.keywords

                                    snippet["pundit"] = {}
                                    snippet["pundit"]["name"] = pundit["name"]
                                    snippet["pundit"]["title"] = pundit["title"]
                                    db['snippets'].append(snippet)
                                except ArticleException:
                                    pass
                            except requests.exceptions.ConnectionError:
                                pass


            json.dump(db, outfile, indent=4)
Exemple #48
0
class ArticleScraper(Article):
    """ For a given article url, it downloads and parses some specific data and writes a JSON in the output_file """
    def __init__(self, url, timestamp, newspaper):
        """ Initialize ArticleScraper """
        self.article_obj = {}
        self.article_obj["url"] = url
        self.article_obj["newspaper"] = newspaper
        self.article_obj["timestamp"] = timestamp

        if self.article_obj:
            # initiate article
            self.article = Article(url, language="es")
            # parse article
            # self.parse_article()

    def parse_article(self):
        """ Download, Parse and NLP a given article """
        try:
            # download source code
            self.article.download()

            # parse code
            self.article.parse()

            # populate article obj with parsed data
            try:
                self.article_obj["title"] = self.article.title
                # self.article_obj["title"] = self.article.title.encode("utf-8").strip()
            except:
                self.article_obj["title"] = ""

            try:
                self.article_obj["authors"] = self.article.authors
            except:
                self.article_obj["authors"] = ""

            try:
                self.article_obj["publish_date"] = self.article.publish_date
                # self.article_obj["publish_date"] = self.article.publish_date.encode("utf-8").strip()
            except:
                self.article_obj["publish_date"] = ""

            try:
                self.article_obj["text"] = self.article.text
                # self.article_obj["text"] = self.article.text.encode("utf-8").strip()
            except:
                self.article_obj["text"] = ""

            try:
                self.article_obj["top_image"] = self.article.top_image
            except:
                self.article_obj["top_image"] = ""

            self.article.nlp()

            try:
                self.article_obj["summary"] = self.article.summary
            except:
                self.article_obj["summary"] = ""

            try:
                self.article_obj["keywords"] = self.article.keywords
            except:
                self.article_obj["keywords"] = []

            # print(self.article_obj)
            return self.article_obj

        except:
            pass
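A minimal driver for the ArticleScraper class above, given purely as a sketch: the URL, timestamp and newspaper name are placeholder values, and parse_article() is called explicitly because the constructor leaves that call commented out.

# Hypothetical usage of ArticleScraper; all argument values are placeholders.
import json
import time

scraper = ArticleScraper(
    url="https://elpais.com/example-article.html",  # placeholder URL
    timestamp=int(time.time()),
    newspaper="El Pais",
)
parsed = scraper.parse_article()  # download, parse and run NLP
if parsed:
    # default=str keeps datetime fields serialisable
    print(json.dumps(parsed, indent=2, default=str))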
Exemple #49
0
def process(update, context):
    if update.message:
        text = update.message.text
    else:
        return
    links = find(text)
    # handling for groups, when message has no links
    if not links:  # and update.message.chat.type == "super_group":
        return
    link = links[0]
    # try:
    #     link = links[0]
    # except:
    #     update.message.reply_text("Oh! Send a valid link.")
    article = Article(link)
    article.download()
    article.parse()
    # article.authors is a list, so join it before building the caption
    if article.authors:
        author = "✍ *Author:* " + ", ".join(article.authors) + "\n"
    else:
        author = ""
    date = "📅 *Publication Date:* "
    try:
        date += str(article.publish_date.strftime('%Y-%m-%d'))
    except:
        if article.publish_date is None:
            date = ""
        else:
            date += str(article.publish_date)
    value = article.html
    tree = fromstring(value)
    title = str(tree.findtext('.//title'))
    lang = translator.detect(title).lang
    if lang != 'en':
        text = translate(link)
        if text == 'null':
            return
        update.message.reply_text(text)
        link = find(text)[0]
        article = Article(link)
        article.download()
        article.parse()
    text = article.text
    soup = bs(value, 'lxml')
    outline = ""
    for heading in soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"]):
        heading_text = heading.text.strip()
        if heading.name in ["h1", "h2"]:
            heading_text = f"*{heading_text}*"
        outline += int(heading.name[1:]) * ' ' + '- ' + heading_text + '\n'
    article.nlp()
    keywords = article.keywords
    tags = ""
    for keyword in keywords:
        tags += " #" + keyword
    summary = article.summary
    summary_points = ""
    for x in summary.splitlines():
        summary_points += "↦️ " + x + "\n"
    summary = summary_points
    read = readtime.of_text(text)
    msg = f"""🔗 *Link:* {link}\n{author}{date}\n🚩 *Title: {title}*\n\n🗨 *Summary:*\n _{summary}_\n"""
    msg += f"""🎋 *Outline: * \n{outline}\n"""
    msg += f"""🤔 *Reading Time:* {read}\n""".replace("min", "mins")
    msg += f"""📑 *Tags:* {tags}\n """
    query = urllib.parse.quote(msg.replace('*', '**').replace('_', '__'))
    share_url = 'tg://msg_url?url=' + query
    button_list = [
        InlineKeyboardButton('Add to reading list', callback_data=1),
        InlineKeyboardButton("📬 Share", url=share_url)
    ]
    reply_markup = InlineKeyboardMarkup(build_menu(button_list, n_cols=2))
    update.message.reply_text(
        msg, parse_mode=telegram.ParseMode.MARKDOWN, reply_markup=reply_markup)

    if update.message.chat_id != ADMIN:
        context.bot.send_message(chat_id="{}".format(ADMIN),
                                 text='{}'.format(
                                     update.message.from_user.first_name + " *sent:*\n" + msg),
                                 parse_mode=telegram.ParseMode.MARKDOWN)
Exemple #50
0
    def post(self):
        try:
            data = json.loads(self.request.body.decode('utf-8'))
        except Exception:
            data = self.get_argument('data')
            data = json.loads(data)

        action = data.get('action')

        if action == 'load_page':
            email = self.current_user.decode('utf-8')

            id = get_user(email)
            id = id[0][0]

            full_summarization = get_mysummary(id)

            full = []

            for data in full_summarization:
                full.append({
                    'title': data[0],
                    'link': data[1],
                    'photo': data[2],
                    'keywords': data[3],
                    'summary': data[4],
                    'date': data[5]
                })

            full.reverse()

            number = get_rowsummary(id)

            self.write(json.dumps({'summary': full, 'number': number[0][0]}))
        elif action == 'summary':
            try:
                url = json.loads(self.request.body.decode('utf-8'))
            except Exception:
                url = self.get_argument('data')
                url = json.loads(url)

            try:
                url = url.get('url')
                url = re.sub(' ', '', url)

                email = self.current_user.decode('utf-8')

                article = Article(url, language='en')
                article.download()
                article.parse()

                title = article.title

                if detect(title) != 'en' or detect(article.text) != 'en':
                    self.write(
                        json.dumps(
                            {'result':
                             'This language will be supported soon'}))
                else:
                    try:
                        image = article.top_image
                    except Exception:
                        image = ''

                    article.nlp()

                    try:
                        keywords = article.keywords
                        keywords = ','.join(keywords)
                    except Exception:
                        keywords = 'Sorry,no,keywords,found'

                    try:
                        summary = article.summary
                        summary = '<p style = "margin: 10px 0px 10px 0px">' + re.sub(
                            r'\.',
                            r'.</p><p style = "margin: 10px 0px 10px 0px">',
                            summary)
                        summary = summary[:-40]
                    except Exception:
                        summary = 'Sorry, no summary found'

                    try:
                        publish_date = article.publish_date
                        publish_date = publish_date.date()
                    except Exception:
                        publish_date = 'XII b.c.'

                    if not url.endswith('/'):
                        url = url + '/'

                    summarized = {
                        'title': title,
                        'link': url,
                        'photo': image,
                        'keywords': keywords,
                        'summary': str(summary),
                        'date': str(publish_date)
                    }

                    id = get_user(email)
                    id = id[0][0]

                    result = main_summarization(summarized, id)

                    summarized = {
                        'title': title,
                        'link': url,
                        'photo': image,
                        'keywords': keywords,
                        'summary': str(summary),
                        'date': str(publish_date)
                    }

                    if result == 'You have this result':
                        jsn = {'result': result}
                        self.write(json.dumps(jsn))
                    else:
                        jsn = {
                            'summary': summarized,
                            'number': result,
                            'result': 'done'
                        }

                        self.write(json.dumps(jsn))
            except Exception:
                jsn = {'result': 'This URL is unsummarizable'}
                self.write(json.dumps(jsn))

        elif action == 'delete':
            try:
                url = json.loads(self.request.body.decode('utf-8'))
            except Exception:
                url = self.get_argument('data')
                url = json.loads(url)

            url = url.get('url')

            email = self.current_user.decode('utf-8')

            id = get_user(email)
            id = id[0][0]

            main_delete(id, url)

            self.write(json.dumps({'result': 'done'}))
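For context, a client drives the handler above by POSTing a JSON body with an "action" field. A hedged sketch follows: the host, port and route are assumptions, and the session cookie that backs self.current_user is omitted, so a real deployment would also need authentication.

# Hypothetical client call; the endpoint address and route are assumptions.
import json
import requests

payload = {"action": "summary", "url": "https://example.com/some-news-story"}
resp = requests.post("http://localhost:8888/summary", json=payload)
print(json.loads(resp.text))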
    def _retrieve_data(self):

        if self.config.dataproducer.pull_data:
            print("Pull data...")
            append_data = []

            # The API caps each request at 50 items, so the full period is
            # covered by querying several date ranges page by page.
            base_url = ('https://cryptonews-api.com/api/v1?tickers=BTC'
                        '&date={dates}&items=50&token=' +
                        self.config.dataproducer.apikey + '&page={page}')
            date_ranges = [
                '01012021-03312021',
                '04012021-06302021',
                # September has 30 days, so the quarter ends on 09302021
                '07012021-09302021',
                '10012021-11052021',
            ]

            for dates in date_ranges:
                # The first request is only used to read the page count.
                with urllib.request.urlopen(
                        base_url.format(dates=dates, page=1)) as url:
                    data = json.loads(url.read().decode())
                    page_count = int(data["total_pages"])
                print(page_count)

                for i in range(1, page_count + 1):
                    df = pd.read_json(base_url.format(dates=dates, page=i))
                    append_data.append(df)

            df = pd.concat(append_data)
            df.to_pickle('./data/raw/corpus.pkl')
            df = pd.json_normalize(df['data'])
            df.rename(columns={'date': 'datetime'}, inplace=True)
            df['date'] = pd.to_datetime(df['datetime'],
                                        format='%a, %d %b %Y %H:%M:%S %z',
                                        utc=True)
            df['date'] = df['date'].dt.date.astype('datetime64')
            df.drop(['image_url', 'topics', 'tickers'], axis=1, inplace=True)
            df = df[df.type.isin(['Article'])]

            nltk.download('punkt')

            user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
            config = Config()
            config.browser_user_agent = user_agent

            records = []
            for ind in df.index:
                record = {}
                article = Article(df['news_url'][ind], config=config)
                try:
                    article.download()
                    article.parse()
                    article.nlp()
                    record['datetime'] = df['datetime'][ind]
                    record['date'] = df['date'][ind]
                    record['news_url'] = df['news_url'][ind]
                    record['title'] = df['title'][ind]
                    record['text'] = df['text'][ind]
                    record['source_name'] = df['source_name'][ind]
                    record['sentiment'] = df['sentiment'][ind]
                    record['type'] = df['type'][ind]
                    record['article_title'] = article.title
                    record['article_text'] = article.text
                    record['article_summary'] = article.summary
                    records.append(record)
                except Exception:
                    pass

            full_df = pd.DataFrame(records)
            full_df.to_pickle("./data/raw/news_corpus_110521.pkl")
        else:
            print('Read saved data...')
            full_df = pd.read_pickle('./data/raw/news_corpus_110521.pkl')

        return full_df
Exemple #52
0
def parse_article(url):
    article = Article(url)
    article.download()
    article.parse()
    article.nlp()
    gen_article_dictionary(article)
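The snippet above calls gen_article_dictionary() without defining it. Purely as an assumption, a minimal version might simply pack the parsed fields into a dict:

# Assumed helper, not part of the original snippet: packs the parsed
# Article fields into a plain dictionary.
def gen_article_dictionary(article):
    return {
        "title": article.title,
        "authors": article.authors,
        "publish_date": str(article.publish_date),
        "text": article.text,
        "top_image": article.top_image,
        "keywords": article.keywords,
        "summary": article.summary,
    }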
print(article.html)

article.parse()

article.authors

article.publish_date

article.text

article.top_image

article.movies

print(article.nlp())

article.keywords

print(article.summary)

import newspaper

elpais = newspaper.build('http://www.elpais.com')
print("number of articles in elpais")
print(len(elpais.articles))

# for article in cnn_paper.articles:
#     print(article.url)

# for category in cnn_paper.category_urls():
# In[4]:


nltk.download('punkt', quiet=True) # Download the punkt package
nltk.download('wordnet', quiet=True)


# In[5]:


#Get the article URL
article = Article('https://www.medicalnewstoday.com/articles/256521')
article.download() #Download the article
article.parse() #Parse the article
article.nlp() #Apply Natural Language Processing (NLP)
corpus = article.text


# In[6]:


print(corpus)


# In[7]:


text = corpus
sent_tokens = nltk.sent_tokenize(text)
Exemple #55
0
def response(user_response):
    bot_response = ''
    questions = [
        "berätta om björnar?", "berätta om katter?", "berätta om hundar?"
    ]  # Reference questions the AI compares against to work out what the user is asking.
    articles = [
        'https://sv.wikipedia.org/wiki/Bj%C3%B6rnar',
        'https://sv.wikipedia.org/wiki/Katt',
        'https://sv.wikipedia.org/wiki/Hund'
    ]  # Links the AI fetches information from to answer the user's questions.
    results = []
    index = 0
    for question in questions:
        Question_similarity = SequenceMatcher(
            a=questions[index], b=user_response
        ).ratio()  # Measures how similar the user's question is to each reference question.
        results.append(
            Question_similarity
        )  # The similarity score for each question is appended to a list.
        index += 1
    results_sorted = sorted(results, key=None, reverse=True)
    index = 0
    for result in results:  # Go through all scores and check whether this one is the best.
        if result == results_sorted[0]:
            article = Article(
                articles[index]
            )  # Pick the link whose article best answers the user's question.
            article.download()
            article.parse()
            article.nlp()
            text = article.text

            sent_tokens = nltk.sent_tokenize(
                text)  # Convert the article text into a list of sentences.
            remove_punct_dict = dict(
                (ord(punct), None) for punct in string.punctuation)
            break
        else:
            index += 1

    sent_tokens.append(user_response)

    TfidVec = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english')
    tfidf = TfidVec.fit_transform(sent_tokens)
    vals = cosine_similarity(
        tfidf[-1], tfidf
    )  # Measures how similar the user's input is to each sentence in the article.
    idx = vals.argsort()[0][
        -2]  # Picks the sentence that is most similar to the user's input.
    flat = vals.flatten()
    flat.sort()
    score = flat[-2]

    if score < 0.1:
        bot_response = bot_response + "Jag förstår tyvärr inte."  # "Sorry, I don't understand."
    else:
        bot_response = bot_response + sent_tokens[
            idx]  # If the score is above 0.1, the bot answers with the best-matching sentence.
    sent_tokens.remove(user_response)
    return bot_response
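A small interactive loop around response(), shown only as a sketch; the prompt strings and exit phrases are arbitrary, and LemNormalize plus the nltk/sklearn imports are assumed to be in scope.

# Hypothetical chat loop; prompts and exit phrases are illustrative only.
while True:
    user_input = input("Du: ").strip().lower()
    if user_input in ("hej då", "quit", "exit"):
        print("Bot: Hej då!")
        break
    print("Bot:", response(user_input))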
Exemple #56
0
def root_from(root_urls, target_dir, delay=0.2):
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    # This function is based on https://github.com/heximhotep/fakenews_scraper
    # it starts with a list of root urls
    # for each root url it gets other urls on that page and forks in this way,
    # scraping everything on the road

    # use this for home news server pages, but not for direct news download
    # you will get a lot of articles, but not all necessarily connected with the original search

    visited_urls = set([])
    saved_articles = set([])
    article_lengths = dict([])

    while (True):
        if (len(root_urls) == 0):
            break
        root_url = root_urls[0]
        root_urls = root_urls[1:]
        # print(root_urls)
        if (root_url in visited_urls):
            continue
        else:
            visited_urls.add(root_url)
        root_paper = newspaper.build(root_url)
        print(root_url, 'size:', root_paper.size())
        print('category urls count:', len(root_paper.category_urls()))
        adjacent_urls = root_paper.category_urls()
        for adj_url in adjacent_urls:
            if (adj_url in visited_urls):
                continue
            root_urls.append(adj_url)
            # print(root_urls)
        index = 0
        visited_streak = 0
        for carticle in root_paper.articles:
            if (visited_streak > 26):
                break
            article = Article(carticle.url)
            try:
                article.download()
                article.parse()

                article_name = fileName(
                    None if article.authors == [] else article.authors[0],
                    article.title, article.publish_date)

                if (article_name in saved_articles and
                        len(article.text) <= article_lengths[article_name]):
                    print('skipping article')
                    visited_streak += 1
                    continue
                visited_streak = 0
                article.nlp()
                saved_articles.add(article_name)
                article_lengths[article_name] = (len(article.text))
                payload = {
                    "url": article.url,
                    "title": article.title,
                    "content": article.text
                }
                features = {
                    "content": {
                        "keywords": [{
                            "keyword": word
                        } for word in article.keywords]
                    }
                }

                articleJSON = {
                    "features": features,
                    "url": article.url,
                    "date": article.publish_date,
                    "title": article.title,
                    "authors": article.authors,
                    "body": article.text
                }
                with open(target_dir + "/" + article_name, 'w') as outfile:
                    json.dump(articleJSON, outfile, indent=2, default=str)
                    print("saved article")
            except ArticleException:
                continue
            except FileNotFoundError:
                continue
            except OSError:
                continue
            except UnicodeError:
                continue
            except Exception:
                continue
            index += 1
        print(index)
        time.sleep(delay)  # rate-limit between root pages
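A possible invocation of root_from(), assuming a couple of seed pages; the URLs and the output directory are placeholders, and a full crawl of this kind can take a long time.

# Hypothetical entry point; seed URLs and target_dir are placeholders,
# not part of the original snippet.
if __name__ == "__main__":
    seed_urls = [
        "https://www.reuters.com",
        "https://www.bbc.com/news",
    ]
    root_from(seed_urls, target_dir="scraped_articles", delay=0.2)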
class ArticleTestCase(unittest.TestCase):
    def setup_stage(self, stage_name):
        stages = OrderedDict([
            ('initial', lambda: None),
            ('download', lambda: self.article.download(
                mock_resource_with('cnn_article', 'html'))),
            ('parse', lambda: self.article.parse()),
            ('meta', lambda: None),  # Alias for nlp
            ('nlp', lambda: self.article.nlp())
        ])
        assert stage_name in stages
        for name, action in stages.items():
            if name == stage_name:
                break
            action()

    def setUp(self):
        """Called before the first test case of this unit begins
        """
        self.article = Article(
            url='http://www.cnn.com/2013/11/27/travel/weather-'
            'thanksgiving/index.html?iref=allsearch')

    @print_test
    def test_url(self):
        self.assertEqual(
            'http://www.cnn.com/2013/11/27/travel/weather-'
            'thanksgiving/index.html?iref=allsearch', self.article.url)

    @print_test
    def test_download_html(self):
        self.setup_stage('download')
        html = mock_resource_with('cnn_article', 'html')
        self.article.download(html)
        self.assertEqual(75406, len(self.article.html))

    @print_test
    def test_meta_refresh_redirect(self):
        # TODO: We actually hit example.com in this unit test ... which is bad
        # Figure out how to mock an actual redirect
        config = Configuration()
        config.follow_meta_refresh = True
        article = Article('', config=config)
        html = mock_resource_with('google_meta_refresh', 'html')
        article.download(input_html=html)
        article.parse()
        self.assertEqual(article.title, 'Example Domain')

    @print_test
    def test_meta_refresh_no_url_redirect(self):
        config = Configuration()
        config.follow_meta_refresh = True
        article = Article('', config=config)
        html = mock_resource_with('ap_meta_refresh', 'html')
        article.download(input_html=html)
        article.parse()
        self.assertEqual(article.title, 'News from The Associated Press')

    @print_test
    def test_pre_download_parse(self):
        """Calling `parse()` before `download()` should yield an error
        """
        article = Article(self.article.url)
        self.assertRaises(ArticleException, article.parse)

    @print_test
    def test_parse_html(self):
        self.setup_stage('parse')

        AUTHORS = [
            'Chien-Ming Wang', 'Dana A. Ford', 'James S.A. Corey',
            'Tom Watkins'
        ]
        TITLE = 'After storm, forecasters see smooth sailing for Thanksgiving'
        LEN_IMGS = 46
        META_LANG = 'en'

        self.article.parse()
        self.article.nlp()

        text = mock_resource_with('cnn', 'txt')
        self.assertEqual(text, self.article.text)
        self.assertEqual(text, fulltext(self.article.html))

        # NOTE: top_img extraction requires an internet connection
        # unlike the rest of this test file
        TOP_IMG = ('http://i2.cdn.turner.com/cnn/dam/assets/131129200805-'
                   '01-weather-1128-story-top.jpg')
        self.assertEqual(TOP_IMG, self.article.top_img)

        self.assertCountEqual(AUTHORS, self.article.authors)
        self.assertEqual(TITLE, self.article.title)
        self.assertEqual(LEN_IMGS, len(self.article.imgs))
        self.assertEqual(META_LANG, self.article.meta_lang)
        self.assertEqual('2013-11-27 00:00:00', str(self.article.publish_date))

    @print_test
    def test_meta_type_extraction(self):
        self.setup_stage('meta')
        meta_type = self.article.extractor.get_meta_type(
            self.article.clean_doc)
        self.assertEqual('article', meta_type)

    @print_test
    def test_meta_extraction(self):
        self.setup_stage('meta')
        meta = self.article.extractor.get_meta_data(self.article.clean_doc)
        META_DATA = defaultdict(
            dict, {
                'medium':
                'news',
                'googlebot':
                'noarchive',
                'pubdate':
                '2013-11-27T08:36:32Z',
                'title':
                'After storm, forecasters see smooth sailing for Thanksgiving - CNN.com',
                'og': {
                    'site_name': 'CNN',
                    'description':
                    'A strong storm struck much of the eastern United States on Wednesday, complicating holiday plans for many of the 43 million Americans expected to travel.',
                    'title':
                    'After storm, forecasters see smooth sailing for Thanksgiving',
                    'url':
                    'http://www.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html',
                    'image':
                    'http://i2.cdn.turner.com/cnn/dam/assets/131129200805-01-weather-1128-story-top.jpg',
                    'type': 'article'
                },
                'section':
                'travel',
                'author':
                'Dana A. Ford, James S.A. Corey, Chien-Ming Wang, and Tom Watkins, CNN',
                'robots':
                'index,follow',
                'vr': {
                    'canonical':
                    'http://edition.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html'
                },
                'source':
                'CNN',
                'fb': {
                    'page_id': 18793419640,
                    'app_id': 80401312489
                },
                'keywords':
                'winter storm,holiday travel,Thanksgiving storm,Thanksgiving winter storm',
                'article': {
                    'publisher': 'https://www.facebook.com/cnninternational'
                },
                'lastmod':
                '2013-11-28T02:03:23Z',
                'twitter': {
                    'site': {
                        'identifier': '@CNNI',
                        'id': 2097571
                    },
                    'card': 'summary',
                    'creator': {
                        'identifier': '@cnntravel',
                        'id': 174377718
                    }
                },
                'viewport':
                'width=1024',
                'news_keywords':
                'winter storm,holiday travel,Thanksgiving storm,Thanksgiving winter storm'
            })

        self.assertDictEqual(META_DATA, meta)

        # if the value for a meta key is another dict, that dict ought to be
        # filled with keys and values
        dict_values = [v for v in list(meta.values()) if isinstance(v, dict)]
        self.assertTrue(all([len(d) > 0 for d in dict_values]))

        # there are exactly 5 top-level "og:type" type keys
        is_dict = lambda v: isinstance(v, dict)
        self.assertEqual(5, len([i for i in meta.values() if is_dict(i)]))

        # there are exactly 12 top-level "pubdate" type keys
        is_string = lambda v: isinstance(v, str)
        self.assertEqual(12, len([i for i in meta.values() if is_string(i)]))

    @print_test
    def test_pre_download_nlp(self):
        """Test running NLP algos before even downloading the article
        """
        self.setup_stage('initial')
        new_article = Article(self.article.url)
        self.assertRaises(ArticleException, new_article.nlp)

    @print_test
    def test_pre_parse_nlp(self):
        """Test running NLP algos before parsing the article
        """
        self.setup_stage('parse')
        self.assertRaises(ArticleException, self.article.nlp)

    @print_test
    def test_nlp_body(self):
        self.setup_stage('nlp')
        self.article.nlp()
        KEYWORDS = [
            'balloons', 'delays', 'flight', 'forecasters', 'good', 'sailing',
            'smooth', 'storm', 'thanksgiving', 'travel', 'weather', 'winds',
            'york'
        ]
        SUMMARY = mock_resource_with('cnn_summary', 'txt')
        self.assertEqual(SUMMARY, self.article.summary)
        self.assertCountEqual(KEYWORDS, self.article.keywords)
Exemple #58
0
def recommend_article(article_feed_url, tag):
    """
    Takes a URL with articles as well as a tag and returns the recommended article title 
    (for now) from this URL based on the tag. Currently selects the article with highest
    tag occurrences in its main text. 
    :param article_feed_url: A URL
    :param tag: String
    :rtype: String
    """

    list_of_article_titles = []
    all_text = []
    key_words = []
    tag_frequency = []
    english_check = re.compile(r'[a-z]')
    if english_check.match(tag):  # english
        print("This is an English website")
        if valid_url(article_feed_url) and url_is_alive(article_feed_url):
            article_urls = get_article_links(article_feed_url)
            try:
                for article_url in article_urls:
                    if not valid_url(article_url) or not url_is_alive(
                            article_url):
                        continue
                    cur_article = Article(article_url, language='en')
                    cur_article.download()
                    cur_article.parse()
                    cur_article.nlp()
                    list_of_article_titles.append(cur_article.title)
                    tag_frequency.append(cur_article.text.lower().count(tag))
                    all_text.append(cur_article.text.lower())
                    key_words.append(cur_article.keywords)
                print("there are in total of {0} articles collected".format(
                    len(list_of_article_titles)))
            except:
                print(
                    "download limit exceeded... but the result so far is returned..."
                )
                print("there are in total of ", len(list_of_article_titles),
                      ' articles collected')
        else:
            return 'Bad URL'

    else:  # chinese
        print("this is a chinese website")
        soup = simple_get(article_feed_url)
        try:
            for article in soup.findAll('a', href=True):
                if article.text and article['href'] and len(
                        article.text.replace(' ', '')) >= 15:
                    cur_article = Article(article_feed_url +
                                          article['href'][1:],
                                          language='zh')
                    cur_article.download()
                    cur_article.parse()
                    cur_article.nlp()
                    list_of_article_titles.append(cur_article.title)
                    all_text.append(cur_article.text.lower())
                    key_words.append(cur_article.keywords)

            print("there are in total of ", len(list_of_article_titles),
                  ' articles collected')
            print("These are the titles of found articles: ",
                  list_of_article_titles)

        except:
            print(
                "download limit exceeded... but the result so far is returned..."
            )
            print("there are in total of ", len(list_of_article_titles),
                  ' articles collected')

        if not all_text:
            return None

    # create vector representation of our articles
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(all_text)

    # create binary target variable (whether or not the tag is in the keywords)
    y = []
    for keywords in key_words:
        check = False
        for keyword in keywords:
            if tag in keyword:
                check = True
                break
        if check:
            y.append(1)
        else:
            y.append(0)

    # build logistic regression model to find article with highest probability
    clf = LogisticRegression().fit(X, y)
    article_probs = clf.predict_proba(X)[:, 1]
    return list_of_article_titles[np.argmax(article_probs)]
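An illustrative call to recommend_article(); the feed URL and tag are placeholders, and the helpers it relies on (valid_url, url_is_alive, get_article_links, simple_get) are assumed to be defined elsewhere in the module.

# Hypothetical usage; the feed URL and tag below are examples only.
best_title = recommend_article("https://example-news-site.com/feed", "bitcoin")
print("Recommended article:", best_title)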
Exemple #59
0
    def post(self):
        data = json.loads(self.request.body.decode('utf-8'))
        # data = {}
        # tornado.httputil.parse_body_arguments(self.request.headers["Content-Type"], self.request.body, data)
        #
        # logging.getLogger().debug("args={}".format(data))

        action = data.get('action')
        # action = action[0]
        # action = action.decode('utf-8')
        email = data.get('email')
        # email = email[0]
        # email = email.decode('utf-8')

        if action == 'load_page':
            id = get_user(email)
            id = id[0][0]

            full_summarization = get_mysummary(id)

            full = []

            for data in full_summarization:
                send_summary = ''
                print(data[4])
                for i in range(3):
                    number = i + 1
                    send_summary = send_summary + '<p style = "margin: 10px 0px 10px 0px">' + data[
                        4].split(
                            '<p style = "margin: 10px 0px 10px 0px">')[number]
                    print(send_summary)

                full.append({
                    'title': data[0],
                    'link': data[1],
                    'photo': data[2],
                    'keywords': data[3],
                    'summary': send_summary,
                    'date': data[5]
                })

            full.reverse()

            self.write(json.dumps({'summary': full}))
        elif action == 'summary':
            try:
                url = data.get('url')
                # url = url[0]
                # url = url.decode('utf-8')

                url = re.sub(' ', '', url)

                article = Article(url, language='en')
                article.download()
                article.parse()

                title = article.title

                if detect(title) != 'en' or detect(article.text) != 'en':
                    self.write(
                        json.dumps(
                            {'result':
                             'This language will be supported soon'}))
                else:
                    try:
                        image = article.top_image
                    except Exception:
                        image = ''

                    article.nlp()

                    try:
                        keywords = article.keywords
                        keywords = ','.join(keywords)
                    except Exception:
                        keywords = 'Sorry,no,keywords,found'

                    try:
                        summary = article.summary
                        summary = '<p style = "margin: 10px 0px 10px 0px">' + re.sub(
                            r'\.',
                            r'.</p><p style = "margin: 10px 0px 10px 0px">',
                            summary)
                        summary = summary[:-40]
                    except Exception:
                        summary = 'Sorry, no summary found'

                    try:
                        publish_date = article.publish_date
                        publish_date = publish_date.date()
                    except Exception:
                        publish_date = 'XII b.c.'

                    if not url.endswith('/'):
                        url = url + '/'

                    summarized = {
                        'title': title,
                        'link': url,
                        'photo': image,
                        'keywords': keywords,
                        'summary': str(summary),
                        'date': str(publish_date)
                    }

                    id = get_user(email)
                    id = id[0][0]

                    result = main_summarization(summarized, id)

                    send_summary = ''
                    for i in range(3):
                        number = i + 1
                        send_summary = send_summary + '<p style = "margin: 10px 0px 10px 0px">' + summary.split(
                            '<p style = "margin: 10px 0px 10px 0px">')[number]

                    summarized = {
                        'title': title,
                        'link': url,
                        'photo': image,
                        'keywords': keywords,
                        'summary': send_summary,
                        'date': str(publish_date)
                    }

                    if result == 'You have this result':
                        jsn = {'result': result}
                        self.write(json.dumps(jsn))
                    else:

                        self.write(json.dumps(summarized))
            except Exception:
                jsn = {'result': 'This URL is unsummarizable'}
                self.write(json.dumps(jsn))

        elif action == 'delete':
            url = data.get('url')

            id = get_user(email)
            id = id[0][0]

            main_delete(id, url)

            self.write(json.dumps({'result': 'done'}))
    def get_stocknews_byticker(self,
                               tickersList,
                               nitems=50,
                               daysback=30,
                               sortby='trending'):
        assert (sortby in ['trending', 'algo'])

        tickers = str(tickersList).replace('[', '').replace(']', '').replace(
            "'", '').replace(' ', '')
        urlInstructions = {
            'ticker':
            tickers,
            'nitems':
            nitems,
            'fromdate_MMDDYYYY':
            (date.today() -
             datetime.timedelta(days=daysback)).strftime('%m%d%Y'),
            'sortby':
            sortby,
            'today':
            date.today(),
        }
        outfileName = 'Finance/temp/{ticker}-{nitems}-{fromdate_MMDDYYYY}-{sortby}-{today}.json'.format(
            **urlInstructions)

        text = self.bqu.read_string_from_gcp(self.bucketName, outfileName)
        if text is None:
            url = self.stocknews_url_template.format(**urlInstructions)
            print(url)
            response = requests.request("GET", url)
            text = response.text
            self.bqu.upload_string_to_gcp(response.text, self.bucketName,
                                          outfileName)

        data = json.loads(text)

        newsDict = data['data']

        sentimentDict = {
            'Count': 0,
            'Negative': 0,
            'Positive': 0,
            'Neutral': 0,
            'Weighted': 0
        }
        sentimentWeight = {'Negative': -1, 'Positive': 1, 'Neutral': 0}
        count = 0
        newsFeed = []
        startTime = dt.utcnow()
        for newsItem in newsDict:
            count += 1
            newItem = {
                key: newsItem[key]
                for key in [
                    'title', 'news_url', 'text', 'sentiment', 'source_name',
                    'topics'
                ]
            }
            newItem['index'] = count
            itemDate = dt.strptime(newsItem['date'],
                                   '%a, %d %b %Y %H:%M:%S %z')
            delta = startTime.date() - itemDate.date()
            if delta.days <= 3 or count <= 3:
                newItem['date'] = str(itemDate.date())
                if False:  # suspend getting the summary
                    article = Article(newItem['news_url'])
                    # Do some NLP
                    try:
                        article.download()  # Downloads the link’s HTML content
                        article.parse()  # Parse the article
                        article.nlp()  # Keyword extraction wrapper
                        newItem['Summary'] = article.summary
                    except Exception as e:
                        print('Error occured:', e)
                        newItem['Summary'] = "<...>"

                #print(newItem['Summary'])
                newsFeed.append(newItem)
            if delta.days <= 3:
                deltaWeight = 1
            elif delta.days <= 7:
                deltaWeight = 0.5
            elif delta.days <= 14:
                deltaWeight = 0.25
            elif delta.days <= 30:
                deltaWeight = 0.125
            else:
                deltaWeight = 0.05

            sentiment = newsItem['sentiment']
            sentimentDict[sentiment] += 1
            sentimentDict['Count'] += 1
            sentimentDict[
                'Weighted'] += sentimentWeight[sentiment] * deltaWeight
        retDict = {
            'NumItems': len(newsFeed),
            'Sentiment': sentimentDict,
            'Newsfeed': newsFeed,
        }

        return retDict