Exemple #1
0
def craft_newsletter():
    '''
    Assemble the newsletter payload.

    Fetches the top "google-news" articles through the Articles client and
    the "latest" items from the LibreNews API, then bundles them with an
    AM/PM edition name and a matching greeting.

    :return: dict with the keys "top_stories", "breaking", "name", "greeting"
    '''
    client = Articles(API_KEY=os.environ["NEWSAPI_KEY"])
    headlines = client.get_by_top(source="google-news")

    latest = requests.get("https://librenews.io/api").json()["latest"]

    # The edition flips to PM once the naive local clock is past 12:00.
    past_noon = datetime.datetime.now(tz=None).time() > datetime.time(12)
    if past_noon:
        period, greeting = "PM", "It's 17:30 ZULU time."
    else:
        period, greeting = "AM", "It's 5:30 ZULU time."

    today_label = datetime.date.today().strftime("%A, %d %B %Y")
    name = period + " - " + today_label

    # Overwrite every article's source with the host part of its URL.
    for article in headlines["articles"]:
        article["source"] = urlparse(article["url"]).netloc

    breaking_items = []
    for item in latest:
        if item["channel"] == "Breaking News":
            breaking_items.append(item)

    return {
        "top_stories": headlines["articles"][:3],
        "breaking": breaking_items[:5],
        "name": name,
        "greeting": greeting,
    }
def news():
    """Train a text classifier, label five fetched descriptions, render them.

    Steps, as performed below:
      1. Load ``X`` and ``y`` from ``X.pkl`` / ``y.pkl`` with pickle.
      2. Vectorize ``X`` with a ``CountVectorizer`` and fit a
         ``LogisticRegression`` on the training split.
      3. Fetch the "abc-news-au" top articles and take their descriptions.
      4. Preprocess and classify the descriptions at positions 0-4.
      5. Render ``news.html`` with the five descriptions and predictions.

    NOTE(review): pickle.load executes arbitrary code from the file; only
    safe if X.pkl / y.pkl are trusted artifacts.
    NOTE(review): the model is re-trained on every call.

    :return: the result of ``render_template('news.html', ...)``
    """
    with open('X.pkl', 'rb') as f:
        X = pickle.load(f)
    with open('y.pkl', 'rb') as f:
        y = pickle.load(f)


    # Generating the training and testing dataset

    count_vectorizer = CountVectorizer()
    X = count_vectorizer.fit_transform(X)  # Fit the Data
    from sklearn.model_selection import train_test_split
    # NOTE(review): test_size=0.0 asks for an empty test split; confirm the
    # installed scikit-learn accepts this value. X_test / y_test are unused.
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.0,
                                                        random_state=42)
    # Classifier: logistic regression (not Naive Bayes).
    clf = LogisticRegression()
    clf.fit(X_train, y_train)

    # NOTE(review): API key is hard-coded in source; consider moving it to
    # configuration.
    apikey = 'c9c0b7a1fc944a02bdadda8c09dace91'
    a = Articles(API_KEY=apikey)
    data = a.get(source="abc-news-au", sort_by='top')
    data = pd.DataFrame.from_dict(data)
    # Expand the per-row 'articles' entries into their own columns.
    data = pd.concat(
        [data.drop(['articles'], axis=1), data['articles'].apply(pd.Series)],
        axis=1)
    description = data['description']

    def pre(x):
        """Clean one description, vectorize it and return clf's prediction."""
        data1 = str(x)
        # The cleaning helpers are defined elsewhere; presumably each takes
        # and returns a string -- TODO confirm.
        data1 = remove_new_lines(data1)
        data1 = remove_stop_words(data1)
        data1 = strip(data1)
        data1 = remove_weird(data1)
        # Wrap the single cleaned value in a 1-D array for transform().
        data1 = np.array(data1).reshape(-1)
        vect = count_vectorizer.transform(data1)
        my_prediction1 = clf.predict(vect)
        return my_prediction1

    # Positions 0-4 are accessed unconditionally, so this assumes at least
    # five descriptions were returned.
    pred0 = pre(description[0])
    pred1 = pre(description[1])
    pred2 = pre(description[2])
    pred3 = pre(description[3])
    pred4 = pre(description[4])

    return render_template('news.html',
                           des0=description[0],
                           des1=description[1],
                           des2=description[2],
                           des3=description[3],
                           des4=description[4],
                           pred0=pred0,
                           pred1=pred1,
                           pred2=pred2,
                           pred3=pred3,
                           pred4=pred4)
Exemple #3
0
 def __init__(self, *args):
     """Read the module configuration and create the news API clients.

     Reads "newsapi.org_key" and "news_limit" from the configuration. When
     no API key is configured, an error message is printed and the clients
     are left as ``None``.

     The previous ``return False`` made every construction without a key
     fail with ``TypeError``, because ``__init__`` must return ``None``.
     """
     super(ReporterModule, self).__init__(*args)
     self.API_KEY = self.get_configuration("newsapi.org_key")
     self.threshold = int(self.get_configuration("news_limit"))
     self.sources_url = {}
     if not self.API_KEY:
         print(_("error.news.configuration"))
         # Keep the attributes defined so the instance has a stable shape.
         self.articles = None
         self.sources = None
         return
     self.articles = Articles(self.API_KEY)
     self.sources = Sources(self.API_KEY)
     self.sources.information()
Exemple #4
0
def listOfArticles():
    """Collect the latest articles of every source from listOfSources().

    Best effort: a source whose request or response handling fails is
    skipped, so the result may cover only some of the sources.

    :return: flat list of the 'articles' entries of each successful response
    """
    sourceList = listOfSources()
    articleList = []

    a = Articles(API_KEY="40e40820d389493abb369f099605fec3")

    for source in sourceList:
        try:
            latest = a.get_by_latest(source=source)
            articleList.extend(latest['articles'])
        except Exception:
            # Narrowed from a bare ``except`` so that KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            continue
    return articleList
Exemple #5
0
 def __init__(self, *args):
     """Read the module configuration and create the news API clients.

     Reads "newsapi.org_key" and "news_limit" from the configuration. When
     no API key is configured, a hint is printed and the clients are left
     as ``None``.

     The previous ``return False`` made every construction without a key
     fail with ``TypeError``, because ``__init__`` must return ``None``.
     """
     super(ReporterModule, self).__init__(*args)
     self.API_KEY = self.get_configuration("newsapi.org_key")
     self.threshold = int(self.get_configuration("news_limit"))
     self.sources_url = {}
     if not self.API_KEY:
         print(
             "Kindly look back at the documentation to configure news module properly especially the API keys."
         )
         # Keep the attributes defined so the instance has a stable shape.
         self.articles = None
         self.sources = None
         return
     self.articles = Articles(self.API_KEY)
     self.sources = Sources(self.API_KEY)
     self.sources.information()
Exemple #6
0
def get_news(sources):
    """Build one generic template element per article of the given source.

    :param sources: source identifier forwarded to ``Articles.get(source=...)``
    :return: list of ``Template.GenericElement`` objects carrying the
        article title, description, link, image and an "Open in web" button
    """
    NEWS_API_KEY = '1bae2e39f2b540f3a15dbbcb269eba9b'
    articles = Articles(API_KEY=NEWS_API_KEY)
    info = articles.get(source=sources)

    news_objects = []
    # Iterate the articles directly; the unused locals of the old index loop
    # (news_array, publishedAt value) were dropped.
    for article in info['articles']:
        url_web = article['url']
        news_objects.append(
            Template.GenericElement(
                title=article['title'],
                subtitle=article['description'],
                item_url=url_web,
                image_url=article['urlToImage'],
                buttons=[Template.ButtonWeb(title='Open in web', url=url_web)]))
    return news_objects
Exemple #7
0
class Scraper:
    """Convenience wrapper around the newsapi ``Articles``/``Sources`` clients."""

    # example code
    # -----------------------
    # x = Scraper(api_key='xyz')
    # print(x.scrape_all_articles(language='en'))

    # Class-level placeholders; the real values are set per instance.
    articles = None
    sources = None
    api_key = None

    def __init__(self, api_key) -> None:
        """Store the key and create one client of each kind with it.

        :param api_key: key handed to ``Articles`` and ``Sources``
        """
        super().__init__()
        self.api_key = api_key
        self.articles = Articles(API_KEY=self.api_key)
        self.sources = Sources(API_KEY=self.api_key)

    def scrape_articles_for_sources(self, sources):
        '''
        Accepts source ids and returns all articles downloaded from them.

        A source whose request fails is skipped.

        :param sources: Iterable of source id's
        :return: List of article json objects, containing:
            'author', 'title', 'description', 'url', 'urlToImage', 'publishedAt'
        '''
        articles = []
        for source in sources:
            try:
                # list of json objects
                # author, title, description, url, urlToImage, publishedAt
                articles_for_source = self.articles.get(source=source).articles
            except Exception:  # e.g. the server does not respond
                # Narrowed from BaseException so that KeyboardInterrupt and
                # SystemExit still propagate.
                continue
            articles.extend(articles_for_source)
        return articles

    def scrape_sources(self, categories=None, language=None):
        '''
        Gets the newsapi sources for the given categories and language.

        Only the listed categories are queried, so an empty or missing
        category list yields an empty result.

        :param categories: List of categories (optional)
        :param language: Language (optional)
        :return: Set of source id's
        '''
        sources_found = []
        # ``None`` replaces the former mutable ``[]`` default.
        for category in categories or ():
            sources_found += self.sources.get(category, language).sources
        return {source['id'] for source in sources_found}

    def scrape_all_articles(self, categories=None, language=None):
        '''
        Scrapes and returns all articles for the given categories and language.

        :param categories: list of categories (optional)
        :param language: language (optional)
        :return: List of article json objects, containing:
            'author', 'title', 'description', 'url', 'urlToImage', 'publishedAt'
        '''
        return self.scrape_articles_for_sources(
            self.scrape_sources(categories, language))
Exemple #8
0
class ReporterModule(BaseModule):
    """Assistant module that fetches news via newsapi and builds a spoken summary.

    The dialogue in ``get_news`` talks through ``self.assistant.say`` and
    reads answers through ``self.assistant.listen().decipher()``.
    """
    # Upper-cased answers treated as "yes" / "no" in the dialogue.
    AFFIRMATIVE = ["YES", "YEAH", "SURE", "YAH", "YA"]
    NEGATIVE = ["NO", "NEGATIVE", "NAH", "NA", "NOPE"]

    def __init__(self, *args):
        """Read the module configuration and create the news API clients.

        When no API key is configured, a hint is printed and the clients are
        left as ``None``. The previous ``return False`` raised ``TypeError``
        on every such construction, because ``__init__`` must return
        ``None``.
        """
        super(ReporterModule, self).__init__(*args)
        self.API_KEY = self.get_configuration("newsapi.org_key")
        self.threshold = int(self.get_configuration("news_limit"))
        self.sources_url = {}
        if not self.API_KEY:
            print(
                "Kindly look back at the documentation to configure news module properly especially the API keys."
            )
            # Keep the attributes defined so the instance has a stable shape.
            self.articles = None
            self.sources = None
            return
        self.articles = Articles(self.API_KEY)
        self.sources = Sources(self.API_KEY)
        self.sources.information()

    def get_all_categories(self):
        """Return the categories reported by the sources client as a list."""
        return list(self.sources.all_categories())

    def get_by_category(self, category):
        """Rebuild and return ``self.sources_url`` for one category.

        :param category: category forwarded to ``Sources.get_by_category``
        :return: dict mapping each source's 'name' to its 'url'
        """
        srcs = self.sources.get_by_category(category).sources
        self.sources_url = {}
        for src in srcs:
            self.sources_url[src['name']] = src['url']
        return self.sources_url

    def get_sort_bys_of_source(self, source_name):
        """Return 'sortBysAvailable' of the first search hit for the source."""
        return self.sources.search(source_name)[0]['sortBysAvailable']

    def all_sources(self):
        """Store the result of ``Sources.all_names()`` and return it."""
        self.sources_url = self.sources.all_names()
        return self.sources_url

    def get_news(self):
        """Run the spoken dialogue for category, source and sort order.

        Asks for an optional category, an optional source and a sort order,
        resolving each answer with ``search``. If no source was chosen, one
        is derived from the sort order.

        :return: the response text built by ``get_response``
        """
        self.assistant.say(
            "Would you prefer any specific category? If yes then what would it be?"
        )
        category_status = self.assistant.listen().decipher()
        if category_status.upper() in self.NEGATIVE:
            category = False
        else:
            categories = self.get_all_categories()
            category = self.search(categories, category_status)
        # The trailing space keeps the joined literals from fusing words.
        self.assistant.say(
            "Any preference you would like to have about source of your news? like CNN "
            "or Time magazine or maybe The hindu?")
        source_status = self.assistant.listen().decipher()
        if source_status.upper() in self.NEGATIVE:
            source = False
        else:
            if category:
                sources_available = self.get_by_category(category)
                response = "Out of all the sources as follows"
                for source_name, source_url in sources_available.items():
                    response += " %s," % source_name
                response += ", which one would you like to pick?"
                self.assistant.say(response)
                source_command = self.assistant.listen().decipher()
                source = self.search(list(sources_available), source_command)
            else:
                self.assistant.say(
                    "So would you want me to list all the sources around 70 which to be "
                    "honest would be a hefty task, so if not, then just let me know of "
                    "your source name and I would let you know if it's available or not."
                )
                all_sources_status = self.assistant.listen().decipher()
                sources_available = self.all_sources()
                if all_sources_status.upper() in self.AFFIRMATIVE:
                    response = "Good job, lazy ass, so here are all the available sources as follows "
                    sources_available_list = list(sources_available)
                    for source_name in sources_available_list:
                        response += " %s," % source_name
                    response += ", which one would you like to pick?"
                    self.assistant.say(response)
                    source_command = self.assistant.listen().decipher()
                    all_sources_status = source_command
                # Either the listed pick or the directly spoken source name.
                source_found = self.search(list(sources_available),
                                           all_sources_status)
                source = source_found
        if source:
            sort_bys_available = self.get_sort_bys_of_source(source)
            if len(sort_bys_available) == 1:
                sort_by = sort_bys_available[0]
            else:
                if len(sort_bys_available) == 2:
                    response = "And what kind of news sort would you like? " \
                               "%s or %s?" % (sort_bys_available[0], sort_bys_available[1])
                else:
                    response = "And what kind of news sort would you like? " \
                               "%s or %s, or maybe %s?" % (sort_bys_available[0],
                                                           sort_bys_available[1],
                                                           sort_bys_available[2])
                self.assistant.say(response)
                sort_by_command = self.assistant.listen().decipher()
                sort_by = self.search(sort_bys_available, sort_by_command)
        else:
            self.assistant.say("And what kind of news sort would you like? "
                               "latest or maybe top ones shown in front page?")
            sort_status_command = self.assistant.listen().decipher()
            # A missing comma used to fuse 'popular' and 'latest' into the
            # single option 'popularlatest', so "latest" could never be
            # returned.
            sort_by = self.search(['top', 'popular', 'latest'],
                                  sort_status_command)
        if not source:
            # Default source per sort order when the user named none.
            if sort_by.lower() == "top":
                source = "google-news"
            elif sort_by.lower() == "latest":
                source = "the-telegraph"
            else:
                source = "time"
        response = self.get_response(source, sort_by)
        return response

    def handle(self):
        """Build the response for the configured "news_source"."""
        source = self.get_configuration("news_source")
        response = self.get_response(source)
        return response

    def get_response(self, source, sort_by=None, threshold=5):
        """Fetch articles of a source and format them as one response text.

        :param source: source name; lower-cased and spaces replaced by "-"
        :param sort_by: sort order forwarded to ``Articles.get``
        :param threshold: maximum number of articles; overridden by
            ``self.threshold`` whenever that is truthy
        :return: text listing title, description and author of each article
        """
        if self.threshold:
            threshold = self.threshold
        source = source.lower().replace(" ", "-")
        articles = self.articles.get(source, sort_by=sort_by).articles
        articles = articles[:threshold]
        response = "So the %s news from %s news source are as follows " % (
            sort_by, source)
        for article in articles:
            if article['title']:
                response += "%s, " % article['title']
            if article['description']:
                response += "%s, " % article['description']
            if article['author']:
                response += "was reported by %s." % article['author']
            response += "and in the other news. "
        return response

    @staticmethod
    def search(dataset, query):
        """Return the dataset entry whose words best match the query.

        Every entry is split into words. Word k of the entry is compared
        with word k of the lower-cased query, character by character at the
        same positions, and each equal character scores one point. The entry
        with the highest score wins; on a tie the earliest entry wins.

        NOTE(review): the last character of each query word is never
        compared (``threshold - 1``), and entries are not lower-cased while
        the query is. Confirm both are intended.

        :param dataset: non-empty sequence of candidate strings
        :param query: free-text query
        :return: the best-scoring element of ``dataset``
        """
        values = [0 for _ in range(0, len(dataset))]
        search = query.lower().split()
        upper_threshold = len(search)
        for index, data in enumerate(dataset):
            search_array = data.split()
            for index2, text in enumerate(search_array):
                if index2 >= upper_threshold:
                    break
                threshold = len(search[index2])
                for i in range(0, len(text)):
                    if i >= threshold - 1:
                        break
                    if text[i] == search[index2][i]:
                        values[index] += 1
        max_value = max(values)
        max_index = values.index(max_value)
        return dataset[max_index]
Exemple #9
0
"""
news.py:
Scrapes news sites (urls) specified in util.SOURCES for articles.
"""
import json
import newspaper
import nltk
import datetime
from newsapi.articles import Articles

# newspaper configuration object with article memoization switched off.
config = newspaper.Config()
config.memoize_articles = False
# The NewsAPI key is read from the 'key' entry of api_keys.json at import time.
with open('api_keys.json') as keyfile:
    NEWSAPI_KEY = json.load(keyfile)['key']
# Module-wide Articles client used by get_news().
news_container = Articles(API_KEY=NEWSAPI_KEY)


def get_news():
    """Request the top articles of a fixed list of sources.

    Each response is bound to a local name. In the part of the function
    visible here nothing is returned or stored.
    """
    # TODO: Generalize for an input of different sources or something
    bbc_news = news_container.get_by_top(source="bbc-news")
    wsj = news_container.get_by_top(source="the-wall-street-journal")
    natgeo = news_container.get_by_top(source="national-geographic")
    reuters = news_container.get_by_top(source="reuters")
    nyt = news_container.get_by_top(source="the-new-york-times")

    techcrunch = news_container.get_by_top(source="techcrunch")
    espn = news_container.get_by_top(source="espn")
    independent = news_container.get_by_top(source="independent")
    polygon = news_container.get_by_top(source="polygon")
    time_mag = news_container.get_by_top(source="time")
    huffpost = news_container.get_by_top(source="the-huffington-post")
Exemple #10
0
 def __init__(self, API_KEY):
     """Create the Articles and Sources clients and call information() on the latter.

     :param API_KEY: key handed to both client constructors
     """
     self.articles = Articles(API_KEY)
     sources_client = Sources(API_KEY)
     self.sources = sources_client
     sources_client.information()
import json
import os

import re
import requests
from newsapi.articles import Articles
from newsapi.sources import Sources

# NOTE(review): API key is hard-coded in source; consider loading it from
# configuration or the environment.
API_KEY = "f044f5b63a7c4139858611a1ae6dc5f0"

# Module-wide newsapi clients sharing the key above.
s = Sources(API_KEY=API_KEY)
a = Articles(API_KEY=API_KEY)

# print(s.information().all_categories())

# print(s.get_by_category("general"))


def get_country_news(country):
    country_id = country + "&"
    url = ('https://newsapi.org/v2/top-headlines?'
           'country=' + country_id + 'apiKey=' + API_KEY)
    response = requests.get(url)
    response = response.json()

    path = os.path.join(os.getcwd(), "posts")
    path = os.path.join(path, "regional_news")
    path = os.path.join(path, country)

    for main_key in response.items():
        if main_key[0] == "articles":
Exemple #12
0
import newsapi
import pandas as pd
from tkinter import *
import tkinter.messagebox

# Placeholder value; a real NewsAPI key must be supplied here.
apikey = '*****'

from newsapi.articles import Articles
# Module-wide Articles client used by the callbacks below.
a = Articles(API_KEY=apikey)


def nw():
    """Ask whether to update; on "yes", confirm on stdout and in a dialog."""
    answer = tkinter.messagebox.askquestion("Action", "Want to Update?")
    if answer != "yes":
        return

    print("Updated!")
    tkinter.messagebox.showinfo("Updated", "Updated!")


# Main window, a frame inside it, and a title label packed at the frame's top.
root = Tk()
tf = Frame(root)
label = Label(tf, text="News Update")
label.pack(side=TOP)


def gr():
    print(var.get())
    data = a.get(source=var.get(), sort_by='top')
    data = pd.DataFrame.from_dict(data)
    data = pd.concat(
        [data.drop(['articles'], axis=1), data['articles'].apply(pd.Series)],
 def __init__(self, api_key=keys.news['api_key']):
     """Set up the API clients, the base URL and the logger.

     :param api_key: key for the Articles/Sources clients; defaults to the
         'api_key' entry of ``keys.news``
     """
     self.api_key = api_key
     self.article = Articles(api_key)
     self.source = Sources(api_key)
     news_settings = keys.news
     self.base_url = news_settings['base_everything_url']
     self.logger = logutils.get_logger('News Data Ingestion')
import newsapi
import requests
import json
import os

from newsapi.articles import Articles
from newsapi.sources import Sources

# NOTE(review): API key is hard-coded in source (twice); consider loading it
# from configuration or the environment.
a = Articles(API_KEY="537b165a4f314fedae8cb39788d4d713")
s = Sources(API_KEY="537b165a4f314fedae8cb39788d4d713")

# Fetch the 'articles' list of each source at import time (one request each).
res = a.get(source="daily-mail")['articles']
bbc = a.get(source="bbc-news")['articles']
telegraph = a.get(source="the-telegraph")['articles']
guardian = a.get(source="the-guardian-uk")['articles']
independent = a.get(source="independent")['articles']
sports = a.get(source="the-sport-bible")['articles']

# results = s.get_by_country("gb").sources
# # s.get_by_category("politics")

#resultsString = ''.join(str(e) for e in results)

# filename = 'news_stream.py'

# with open(filename, 'a') as file:
#     for result in independent:
#         print(result['title'])
#         # If you want other things from the tweet object you can specify it here
#         file.write(result['title'] + os.linesep)
Exemple #15
0
from newsapi.articles import Articles
import requests
import json

# NOTE(review): the Slack webhook URL and the API key below are secrets
# hard-coded in source; consider loading them from configuration.
url = 'https://hooks.slack.com/services/T8N4B1741/B8NPU0hjjjFJR/Bm3J7dDdYFDmKcYtTCwggxsj'

a = Articles(API_KEY="25ead91356d64bf38bff3fc87bcd5d8e")

payload = {
    "attachments": [{
        "title": "The Further Adventures of Slackbot",
        "author_icon":
        "http://a.slack-edge.com/7f18https://a.slack-edge.com/bfaba/img/api/homepage_custom_integrations-2x.png",
        "image_url": "http://i.imgur.com/OJkaVOI.jpg?1"
    }, {
        "title":
        "Headline",
        "text":
        "After @episod pushed exciting changes to a devious new branch back in Issue 1, Slackbot notifies @don about an unexpected deploy..."
    }, {
        "fallback":
        "Read More About it.",
        "title":
        "Read More About it.",
        "callback_id":
        "comic_1234_xyz",
        "color":
        "#3AA3E3",
        "attachment_type":
        "default",
        "actions": [
Exemple #16
0
 def __init__(self, api_key) -> None:
     """Store the key and create one Articles and one Sources client with it.

     :param api_key: key handed to both client constructors
     """
     super().__init__()
     self.api_key = api_key
     key = self.api_key
     self.articles = Articles(API_KEY=key)
     self.sources = Sources(API_KEY=key)
Exemple #17
0
from newsapi.articles import Articles
from newsapi.sources import Sources
import secrets


# Clients are built from the key stored in the imported ``secrets`` module.
a = Articles(API_KEY=secrets.newapi_api)
s = Sources(API_KEY=secrets.newapi_api)

# Fixed typo: the Sources method is ``information``, not ``informaion``.
print(s.information())
Exemple #18
0
from newsapi.articles import Articles
# import pymongo
# from pymongo import MongoClient
from cassandra.cluster import Cluster
from cassandra.query import BatchStatement, SimpleStatement
import json
import pprint
import sys

# Articles client used to make requests through the API.
# NOTE(review): the API key is hard-coded in source.
api = Articles(API_KEY="3e40232f1ea246cb85c76b46bc7543d3")

# The source ids selected for querying.
sources = ["abc-news-au","al-jazeera-english","ars-technica","associated-press","bbc-news","bbc-sport","bild","bloomberg","breitbart-news","business-insider",
"business-insider-uk","buzzfeed","cnbc","cnn","daily-mail","engadget","entertainment-weekly","espn",
"espn-cric-info","financial-times","focus","football-italia","fortune","four-four-two","fox-sports","google-news","gruenderszene",
"hacker-news","ign","independent","mashable","metro","mirror","mtv-news","mtv-news-uk","national-geographic","new-scientist",
"newsweek","new-york-magazine","nfl-news","polygon","recode","reddit-r-all","reuters","spiegel-online","t3n","talksport","techcrunch",
"techradar","the-economist","the-guardian-au","the-guardian-uk","the-hindu","the-huffington-post","the-lad-bible","the-new-york-times",
"the-sport-bible","the-telegraph","the-times-of-india","the-verge","the-wall-street-journal","the-washington-post",
"time","usa-today","wired-de"]
'''["the-wall-street-journal", "al-jazeera-english", "bbc-news", "bloomberg", "business-insider", "cnbc", "cnn", "daily-mail", 
	"engadget", "espn", "financial-times", "fortune", "fox-sports", "mtv-news", "new-scientist","new-york-magazine","nfl-news","reuters",
	 "talksport","techcrunch","the-economist","the-guardian-uk","the-hindu","the-new-york-times","the-sport-bible","the-times-of-india",
	  "the-verge","the-wall-street-journal","time","usa-today"]'''

#wsj = api.get(source="the-wall-street-journal", sort_by="top")

# wsj_arts_json = json.dumps(wsj.articles,ensure_ascii=False)
# type(wsj_arts_json)
Exemple #19
0
import newsapi
import numpy
import pandas as pd
from newsapi.articles import Articles

# NOTE(review): API key is hard-coded in source.
apikey = '455e01c84ca44ff387187f10f202bed3'
a = Articles(API_KEY=apikey)
# Top articles of The New York Times, as returned by the client.
data = a.get(source="the-new-york-times", sort_by='top')

#print (data) ## raw news data

## -----------------------------------------------------------

# Turn the response into a DataFrame, then expand the per-row 'articles'
# entries into their own columns next to the remaining response columns.
data = pd.DataFrame.from_dict(data)
data = pd.concat(
    [data.drop(['articles'], axis=1), data['articles'].apply(pd.Series)],
    axis=1)

#data.head()

# Drop the columns listed below from the frame.
# Presumably only title and description remain -- verify against the
# response schema.

news_df = data.drop(columns=[
    'status', 'source', 'sortBy', 'author', 'url', 'urlToImage', 'publishedAt'
])

#print(news_df)

# Visual separator lines on stdout.
print("---------------------------------------------------------------------")
print("---------------------------------------------------------------------")
Exemple #20
0
import rq
from rq import get_current_job, Queue
from rq.job import Job
import simplify
import json
import requests
import time
import base64
import calendar
import io

# Module-level state, initialised empty.
utility_referral = ''
# simplify.public_key = "sbpb_Njc3ZDkyMmYtYTE0OS00MTRjLWE5YmUtZjQ3MTI5ZWUzNmE3"
# simplify.private_key = "3KzZq8dCCUhQMh1dTCU6jPrwdG0O4wwwizAP82LcfpN5YFFQL0ODSXAOkNtXTToq"
#ALPHAVANTAGE_API_KEY = "OYL0XNT0O85E76PM"
# NOTE(review): credentials appear in source, both above in comments and in
# the hard-coded key below; consider removing and rotating them.
newsapi = Articles(API_KEY='3c0850b9cd1041989ae33dd295793c51')
job_id = ''
# queue = rq.Queue(connection=Redis.from_url('redis://'))
# queue.delete(delete_jobs=True)
# queue = rq.Queue('tasks', connection=Redis.from_url('redis://'))


@app.route('/')
@app.route('/index')
# @login_required
def index():
    """Render index.html; registered for both '/' and '/index'."""
    return render_template('index.html')


##################### BEGIN SPRINT 1 CODE ##########################
Exemple #21
0
# -*- coding: utf-8 -*-

"""
About:
Python wrapper for the New York Times Archive API
https://developer.nytimes.com/article_search_v2.json
"""
from newsapi.articles import Articles
import sys,json
import requests

# NOTE(review): API key is hard-coded in source.
key = '522497f7b4b940b7946eeed6909ed817'
params = {}
# Module-wide Articles client built with the key above.
api = Articles(API_KEY=key)


class APIKeyException(Exception):
    """Exception that keeps its text on the ``message`` attribute."""

    def __init__(self, message):
        self.message = message


class InvalidQueryException(Exception):
    """Exception that keeps its text on the ``message`` attribute."""

    def __init__(self, message):
        self.message = message


class ArchiveAPI(object):
    def __init__(self, key=None):
        self.key = key
        self.root = 'http://api.nytimes.com/svc/archive/v1/{}/{}.json?api-key={}'
        if not self.key:
Exemple #22
0
from flask import Flask, jsonify, render_template, request, session, flash, redirect,abort 
from newsapi.articles import Articles
from newsapi.sources import Sources
#from flask.ext.socketio import SocketIO, emit
from sqlalchemy.orm import sessionmaker
import os
from tabledef import *
# SQLAlchemy engine for the local SQLite database file; echo=True logs the
# emitted SQL.
engine = create_engine('sqlite:///database.db', echo=True)

app = Flask(__name__)
# NOTE(review): API key is hard-coded in source (twice).
a = Articles(API_KEY="867af1dffb80450b9770b4bcc10c8e14")
s = Sources(API_KEY="867af1dffb80450b9770b4bcc10c8e14")
"""app.config['SECRET_KEY'] = 'secret!'
socketio = SocketIO(app)

@socketio.on('my event')                          # Decorator to catch an event called "my event":
def test_message(message):                        # test_message() is the event callback function.
    emit('my response', {'data': 'got it!'})      # Trigger a new event called "my response" 
 """                                                 # that can be caught by another callback later in the program.

@app.route("/")
def home():
	"""Render the welcome page for logged-in sessions, else the login page."""
	template_name = "Welcome.html" if session.get('logged_in') else "login.html"
	return render_template(template_name)

@app.route('/login',methods=["POST"])
def do_admin_login():
 
    POST_USERNAME = str(request.form['username'])
Exemple #23
0
class News:
    """Fetches articles over HTTP and through the Articles/Sources clients."""

    def __init__(self, api_key=keys.news['api_key']):
        """Create the API clients and remember the base URL.

        :param api_key: API key; defaults to the 'api_key' entry of
            ``keys.news``
        """
        self.api_key = api_key
        self.article = Articles(self.api_key)
        self.source = Sources(self.api_key)
        self.base_url = keys.news['base_everything_url']

    def get_data(self,
                 query,
                 from_date=None,
                 to_date=None,
                 page_size=100,
                 sort_by='publishedAt',
                 language='en',
                 **kwargs):
        """Query ``self.base_url`` and return the parsed article list.

        Parameters whose value is ``None`` are left out of the request.

        :param query: search expression sent as ``q``
        :param from_date: lower date bound sent as ``from`` (optional)
        :param to_date: upper date bound sent as ``to`` (optional)
        :param page_size: value sent as ``pageSize``
        :param sort_by: value sent as ``sortBy``
        :param language: value sent as ``language``
        :param kwargs: accepted for interface compatibility; unused
        :return: list of article dicts as produced by ``_parse_data``
        """
        key_value_params = {
            'apiKey': self.api_key,
            'q': query,
            'from': from_date,
            'to': to_date,
            'sortBy': sort_by,
            'pageSize': page_size,
            'language': language
        }

        url = self._data_config(self.base_url,
                                query_separator='?',
                                key_value_params=key_value_params)
        response = requests.get(
            url,
            headers={
                'User-Agent':
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
            })
        return self._parse_data(response.json())

    @staticmethod
    def _data_config(base_url, **kwargs):
        """Build a request URL from a base URL and optional query parameters.

        Recognised keyword arguments:
          * ``query_separator`` -- text placed between base URL and query
          * ``key_value_params`` -- mapping of parameters; ``None`` values
            are skipped

        Keys and values are percent-encoded. Previously they were joined
        verbatim, so characters such as ``&``, ``#`` or ``+`` in a query
        corrupted the URL. When no parameter is left to append, the base URL
        is returned unchanged; the old code sliced off the last character
        unconditionally and could truncate the base URL.

        :param base_url: URL to extend
        :return: the assembled URL
        """
        from urllib.parse import urlencode

        query_separator = kwargs.get('query_separator')
        key_value_params = kwargs.get('key_value_params')

        pairs = []
        if key_value_params is not None:
            pairs = [(key, value) for key, value in key_value_params.items()
                     if value is not None]
        if not pairs:
            return base_url

        url = base_url
        if query_separator is not None:
            url += str(query_separator)
        return url + urlencode(pairs)

    @staticmethod
    def _parse_data(news_response_json):
        """Extract the articles of a response and flatten their source.

        For a response whose 'status' is 'ok', each article's 'source' is
        replaced by its 'name' entry. Articles without such a nested name
        are left untouched.

        :param news_response_json: decoded JSON response
        :return: list of article dicts (empty when status is not 'ok')
        """
        article_list = []
        if news_response_json['status'] == 'ok':
            article_list = news_response_json['articles']
        for article in article_list:
            try:
                article['source'] = article['source']['name']
            except (KeyError, TypeError):
                # Missing or non-mapping 'source': keep the article as is.
                pass
        return article_list

    def get_articles(self, source_id, selection_type="popular"):
        """Fetch articles of one source through the Articles client.

        :param source_id: source identifier
        :param selection_type: 'latest' or 'top'; any other value selects
            the popular articles
        :return: the client's response for the chosen selection
        """
        if selection_type == 'latest':
            return self.article.get_by_latest(source_id)
        elif selection_type == 'top':
            return self.article.get_by_top(source_id)
        else:
            return self.article.get_by_popular(source_id)
class NewsDataIngestion(DataIngestionInterface):
    """Ingestion pipeline: fetch news, parse the articles, store them."""

    def __init__(self, api_key=keys.news['api_key']):
        """Create the API clients, remember the base URL and get a logger.

        :param api_key: API key; defaults to the 'api_key' entry of
            ``keys.news``
        """
        self.api_key = api_key
        self.article = Articles(self.api_key)
        self.source = Sources(self.api_key)
        self.base_url = keys.news['base_everything_url']
        self.logger = logutils.get_logger('News Data Ingestion')

    def get_data(self,
                 query,
                 from_date=None,
                 to_date=None,
                 page_size=100,
                 sort_by='publishedAt',
                 language='en',
                 **kwargs):
        """Query ``self.base_url`` and return the decoded JSON response.

        Parameters whose value is ``None`` are left out of the request.

        :param query: search expression sent as ``q``
        :param from_date: lower date bound sent as ``from`` (optional)
        :param to_date: upper date bound sent as ``to`` (optional)
        :param page_size: value sent as ``pageSize``
        :param sort_by: value sent as ``sortBy``
        :param language: value sent as ``language``
        :param kwargs: accepted for interface compatibility; unused
        :return: decoded JSON body of the response
        """
        key_value_params = {
            'apiKey': self.api_key,
            'q': query,
            'from': from_date,
            'to': to_date,
            'sortBy': sort_by,
            'pageSize': page_size,
            'language': language
        }

        url = self.data_config(self.base_url,
                               query_separator='?',
                               key_value_params=key_value_params)
        response = requests.get(url)
        return response.json()

    def data_config(self, base_url, **kwargs):
        """Build a request URL from a base URL and optional query parameters.

        Recognised keyword arguments:
          * ``query_separator`` -- text placed between base URL and query
          * ``key_value_params`` -- mapping of parameters; ``None`` values
            are skipped

        Keys and values are percent-encoded. Previously they were joined
        verbatim, so characters such as ``&``, ``#`` or ``+`` in a query
        corrupted the URL. When no parameter is left to append, the base URL
        is returned unchanged; the old code sliced off the last character
        unconditionally and could truncate the base URL.

        :param base_url: URL to extend
        :return: the assembled URL
        """
        from urllib.parse import urlencode

        query_separator = kwargs.get('query_separator')
        key_value_params = kwargs.get('key_value_params')

        pairs = []
        if key_value_params is not None:
            pairs = [(key, value) for key, value in key_value_params.items()
                     if value is not None]
        if not pairs:
            return base_url

        url = base_url
        if query_separator is not None:
            url += str(query_separator)
        return url + urlencode(pairs)

    def store_data(self, data_list, connection_object):
        """Insert the non-duplicate documents and close the connection.

        A failure on one document is logged and does not stop the others.
        The connection is closed even if an unexpected error interrupts the
        loop.

        :param data_list: iterable of document dicts
        :param connection_object: connection exposing ``set_collection``,
            ``insert_document`` and ``close_connection``
        """
        connection_object.set_collection(constants.NEWS_COLLECTION_NAME)
        try:
            for data_dict in data_list:
                try:
                    if newsutils.check_duplicate_document(data_dict) is False:
                        connection_object.insert_document(data_dict)
                except Exception:
                    # Narrowed from a bare ``except`` so interrupts propagate.
                    self.logger.error('Error while inserting data')
        finally:
            connection_object.close_connection()

    def parse_data(self, news_response_json, product='default'):
        """Extract the articles of a response and enrich each one.

        For a response whose 'status' is 'ok', each article gets its
        'source' replaced by the source name, a 'product' tag and a
        'human_date' converted from 'publishedAt'. A failure on one article
        is logged and leaves that article partially updated.

        :param news_response_json: decoded JSON response
        :param product: tag stored on every article
        :return: list of article dicts (empty when status is not 'ok')
        """
        article_list = []
        if news_response_json['status'] == 'ok':
            article_list = news_response_json['articles']
        for article in article_list:
            try:
                article['source'] = article['source']['name']
                article['product'] = product
                article[
                    'human_date'] = newsutils.convert_string_timestamp_to_python_date(
                        article['publishedAt'])
            except Exception:
                # Narrowed from a bare ``except`` so interrupts propagate.
                self.logger.error("error while parsing data")
        return article_list

    def get_articles(self, source_id, selection_type="popular"):
        """Fetch articles of one source through the Articles client.

        :param source_id: source identifier
        :param selection_type: 'latest' or 'top'; any other value selects
            the popular articles
        :return: the client's response for the chosen selection
        """
        if selection_type == 'latest':
            return self.article.get_by_latest(source_id)
        elif selection_type == 'top':
            return self.article.get_by_top(source_id)
        else:
            return self.article.get_by_popular(source_id)

    def data_ingestion_pipeline(self, query, product='default'):
        """Fetch, parse and store the news for one query.

        :param query: search expression passed to ``get_data``
        :param product: tag stored on every article and used in log lines
        """
        news_json = self.get_data(query)
        self.logger.info("News data fetched for product %s " % product)
        parsed_news_list = self.parse_data(news_json, product=product)
        self.logger.info("News Data parsed for product %s" % product)
        mongo_connector = dbutils.get_mongodb_connection()
        self.store_data(parsed_news_list, mongo_connector)
        self.logger.info("News data stored for product %s " % product)
Exemple #25
0
 def __init__(self, api_key=keys.news['api_key']):
     """Set up the API clients and the base URL.

     :param api_key: key for the Articles/Sources clients; defaults to the
         'api_key' entry of ``keys.news``
     """
     self.api_key = api_key
     self.article = Articles(api_key)
     self.source = Sources(api_key)
     news_settings = keys.news
     self.base_url = news_settings['base_everything_url']
# Usage demo, style 1: separate Articles / Sources clients.
from newsapi.articles import Articles
from newsapi.sources import Sources
# NOTE(review): the API key is hard-coded in source; consider loading it from
# the environment or an untracked config file instead.
key = '96af62a035db45bda517a9ca62a25ac3'
a, s = Articles(API_KEY=key), Sources(API_KEY=key)
s.all()  # get all sources offered by newsapi

# Fetch articles for one source, then query sources by filter. Both return
# values are discarded here.
a.get(source='the-new-york-times')
s.get(category='technology', language='en', country='US')

# Usage demo, style 2: the single NewsAPI client from the package root.
from newsapi import NewsAPI

# Same literal as above, assigned a second time.
key = '96af62a035db45bda517a9ca62a25ac3'
params = {}
api = NewsAPI(key)
sources = api.sources(params)
# Uses the id of the first returned source.
# Assumes `sources` is a non-empty sequence of dicts with an 'id' key -- TODO confirm.
articles = api.articles(sources[0]['id'], params)
################ NY Times API #############################################

import sys, csv, json
# NOTE(review): Python 2-only idiom. `reload` is a builtin only on Python 2,
# and `sys.setdefaultencoding` does not exist on Python 3, so these two lines
# fail there. They change the process-wide default string encoding to UTF-8.
reload(sys)
sys.setdefaultencoding('utf8')
"""
About:
Python wrapper for the New York Times Archive API 
https://developer.nytimes.com/article_search_v2.json
"""


class APIKeyException(Exception):
    def __init__(self, message):
Exemple #27
0
class ReporterModule(BaseModule):
    """Assistant module that reads out news fetched through newsapi.org.

    The module talks to the user through ``self.assistant`` and obtains all
    user-facing text through ``_`` (defined elsewhere; presumably a
    translation lookup keyed by message id).
    """

    # Upper-cased spoken answers treated as "yes" / "no".
    AFFIRMATIVE = ["YES", "YEAH", "SURE", "YAH", "YA"]
    NEGATIVE = ["NO", "NEGATIVE", "NAH", "NA", "NOPE"]

    def __init__(self, *args):
        """Read the module configuration and build the newsapi clients.

        When no API key is configured an error message is printed and
        ``self.articles`` / ``self.sources`` are left as ``None``.
        """
        super(ReporterModule, self).__init__(*args)
        self.API_KEY = self.get_configuration("newsapi.org_key")
        self.threshold = int(self.get_configuration("news_limit"))
        self.sources_url = {}
        if not self.API_KEY:
            print(_("error.news.configuration"))
            # ``__init__`` must return None: returning False here made every
            # instantiation without a key raise TypeError.
            self.articles = None
            self.sources = None
            return
        self.articles = Articles(self.API_KEY)
        self.sources = Sources(self.API_KEY)
        self.sources.information()

    def get_all_categories(self):
        """Return the categories reported by the sources client as a list."""
        return list(self.sources.all_categories())

    def get_by_category(self, category):
        """Rebuild and return ``self.sources_url`` for one category.

        :param category: category passed to the sources client
        :return: dict mapping source name to source url
        """
        srcs = self.sources.get_by_category(category).sources
        self.sources_url = {}
        for src in srcs:
            self.sources_url[src['name']] = src['url']
        return self.sources_url

    def get_sort_bys_of_source(self, source_name):
        """Return ``sortBysAvailable`` of the first search hit for a source."""
        return self.sources.search(source_name)[0]['sortBysAvailable']

    def all_sources(self):
        """Store and return every source name known to the sources client."""
        self.sources_url = self.sources.all_names()
        return self.sources_url

    def get_news(self):
        """Interactively choose a category, source and ordering, then report.

        Each question is spoken with ``self.assistant.say`` and the answer is
        read with ``self.assistant.listen().decipher()``. Free-form answers
        are mapped onto the available options with :meth:`search`.

        :return: the report text built by :meth:`get_response`
        """
        # --- category -----------------------------------------------------
        self.assistant.say(_("news.category.ask"))
        category_status = self.assistant.listen().decipher()
        if category_status.upper() in self.NEGATIVE:
            category = False
        else:
            category = self.search(self.get_all_categories(), category_status)

        # --- source -------------------------------------------------------
        self.assistant.say(_("news.sources.ask"))
        source_status = self.assistant.listen().decipher()
        if source_status.upper() in self.NEGATIVE:
            source = False
        elif category:
            # List the sources of the chosen category and let the user pick.
            sources_available = self.get_by_category(category)
            response = _("news.sources.list")
            response += "".join(" %s," % name for name in sources_available)
            response += _("news.sources.select")
            self.assistant.say(response)
            source_command = self.assistant.listen().decipher()
            source = self.search(list(sources_available), source_command)
        else:
            self.assistant.say(_("news.sources.all.ask"))
            all_sources_status = self.assistant.listen().decipher()
            sources_available = self.all_sources()
            if all_sources_status.upper() in self.AFFIRMATIVE:
                # The user wants to hear every source before choosing.
                response = _("news.sources.all")
                response += "".join(
                    " %s," % name for name in sources_available)
                response += _("news.sources.select")
                self.assistant.say(response)
                all_sources_status = self.assistant.listen().decipher()
            # Otherwise the first answer itself is matched as a source name.
            source = self.search(list(sources_available), all_sources_status)

        # --- ordering -----------------------------------------------------
        if source:
            sort_bys_available = self.get_sort_bys_of_source(source)
            if len(sort_bys_available) == 1:
                sort_by = sort_bys_available[0]
            else:
                if len(sort_bys_available) == 2:
                    response = _("news.sort.two_options").format(
                        sort_bys_available[0], sort_bys_available[1])
                else:
                    response = _("news.sort.three_options").format(
                        sort_bys_available[0],
                        sort_bys_available[1],
                        sort_bys_available[2],
                    )
                self.assistant.say(response)
                sort_by_command = self.assistant.listen().decipher()
                sort_by = self.search(sort_bys_available, sort_by_command)
        else:
            self.assistant.say(_("news.sort.described_options"))
            sort_status_command = self.assistant.listen().decipher()
            # Three distinct options; a missing comma used to fuse the last
            # two into the single string 'popularlatest'.
            sort_by = self.search(
                ['top', 'popular', 'latest'], sort_status_command)
            # No source was chosen: fall back to a fixed source per ordering.
            if sort_by.lower() == "top":
                source = "google-news"
            elif sort_by.lower() == "latest":
                source = "the-telegraph"
            else:
                source = "time"
        return self.get_response(source, sort_by)

    def handle(self):
        """Report news from the configured ``news_source`` (no ordering)."""
        source = self.get_configuration("news_source")
        return self.get_response(source)

    def get_response(self, source, sort_by=None, threshold=5):
        """Build the report text for one source.

        :param source: source name; lower-cased and spaces replaced by ``-``
            before it is sent to the articles client
        :param sort_by: ordering forwarded to the articles client
        :param threshold: maximum number of articles; only used when
            ``self.threshold`` is falsy, otherwise ``self.threshold`` wins
        :return: the assembled report string
        """
        if self.threshold:
            threshold = self.threshold
        source = source.lower().replace(" ", "-")
        articles = self.articles.get(source, sort_by=sort_by).articles
        response = _("news.report").format(sort_by, source)
        for article in articles[:threshold]:
            # Empty/None fields are skipped rather than read out.
            if article['title']:
                response += "%s, " % article['title']
            if article['description']:
                response += "%s, " % article['description']
            if article['author']:
                response += _("news.report.by").format(article['author'])
            response += _("news.report.continue")
        return response

    @staticmethod
    def search(dataset, query):
        """Return the entry of *dataset* that best matches *query*.

        Both sides are lower-cased and split on whitespace. Words are paired
        by position and every position where the characters agree scores one
        point. The first entry with the highest score is returned.

        :param dataset: non-empty indexable sequence of strings
        :param query: free-form text to match
        :return: the best-scoring element of *dataset*
        :raises ValueError: if *dataset* is empty
        """
        scores = [0] * len(dataset)
        query_words = query.lower().split()
        for index, data in enumerate(dataset):
            # Lower-case the candidate too, so capitalised names such as
            # "Ars Technica" match the already lower-cased query.
            for text, word in zip(data.lower().split(), query_words):
                # NOTE(review): the last character of each query word is
                # never compared (the original ``threshold - 1`` bound is
                # preserved); this looks like an off-by-one -- confirm.
                scores[index] += sum(
                    1 for a, b in zip(text, word[:-1]) if a == b)
        return dataset[scores.index(max(scores))]
import pandas as pd
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import unicodedata
import math
import h5py
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.python.framework import ops

# NOTE(review): the API key is hard-coded in source.
key = '96af62a035db45bda517a9ca62a25ac3'

# Module-level newsapi clients, both built from the key above.
a, s = Articles(API_KEY=key), Sources(API_KEY=key)


class APIKeyException(Exception):
    """Error type for API-key problems (per its name); carries ``message``."""

    def __init__(self, message):
        # Chain to Exception so ``args`` and ``str()`` carry the message even
        # when it is passed by keyword.
        super(APIKeyException, self).__init__(message)
        self.message = message


class InvalidQueryException(Exception):
    """Error type for invalid queries (per its name); carries ``message``."""

    def __init__(self, message):
        # Chain to Exception so ``args`` and ``str()`` carry the message even
        # when it is passed by keyword.
        super(InvalidQueryException, self).__init__(message)
        self.message = message


class ArchiveAPI(object):
    def __init__(self, key=None):
# -*- coding: utf-8 -*-
"""
Created on Mon Apr 03 10:57:46 2017

@author: Sarang
"""

from newsapi.articles import Articles
from newsapi.sources import Sources

# Read the key inside a context manager so the file handle is closed rather
# than left open for the life of the process.
with open("API_KEY.txt") as f:
    # Strip surrounding whitespace: a trailing newline in the file would
    # otherwise become part of the key.
    api_key = f.read().strip()

a = Articles(api_key)
s = Sources(api_key)

# The parenthesised form is valid on both Python 2 and Python 3.
print(a)

#print s.get(category='technology', language='en', country='uk')

import requests

# NOTE(review): the API key is embedded in this URL; consider building the
# query from a key that is not committed to source.
# The timeout keeps the script from hanging forever if the server never
# answers.
r = requests.get(
    'https://newsapi.org/v1/articles?source=the-next-web&sortBy=latest&apiKey=153cffe401b84aa8ab8f19d01a354747',
    timeout=10,
)
# The parenthesised form is valid on both Python 2 and Python 3.
print(r.text)