Example #1
def get_urls(criteria):
    # Resolve a free-form criteria string against the supported topic,
    # country, and website lookups (module-level collections defined
    # elsewhere). Later matches override earlier ones, so a website match
    # takes precedence over country, which takes precedence over topic.
    result = {'urls': [], 'type': None}
    if criteria.lower() in supported_topics:
        result['urls'] = urls(topic=criteria)
        result['type'] = 'topic'
    if criteria.upper() in supported_countries:
        result['urls'] = urls(country=criteria)
        result['type'] = 'country'
    if criteria.lower() in supported_urls:
        result['urls'] = [criteria.lower()]
        result['type'] = 'website'
    return result
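
# A minimal sketch of the context get_urls assumes: the import and the three
# lookup collections are not shown in this example, so the values below are
# illustrative guesses, not the author's actual definitions.
from newscatcher import urls

supported_topics = {'tech', 'finance', 'politics'}            # assumption
supported_countries = {'US', 'IN', 'GB'}                      # assumption
supported_urls = {'wsj.com', 'news.ycombinator.com'}          # assumption

# e.g. get_urls('finance') -> {'urls': [...], 'type': 'topic'}
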
def retrieve_urls(country=None, language=None, topic=None):
    """
    This function calls the newscatcher api and returns the urls to be invoked.
    """
    try:
        if country:
            country = None if country == "ALL" else validate_2_char_iso_code(
                country)

        if language:
            language = None if language == "ALL" else validate_2_char_iso_code(
                language)

        if topic:
            topic = None if topic == "ALL" else validate_topic(topic)

        logger.info(
            f"Parameters to retrieve list are - country:{country}, language:{language}, topic:{topic}"
        )
        url_list = newscatcher.urls(country=country,
                                    language=language,
                                    topic=topic)
        logger.debug(f"retrieved url list: {url_list}")
        return url_list

    except TypeError:
        logger.error("Fetching urls threw an Error")
        raise
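
# The helpers above (logger, validate_2_char_iso_code, validate_topic) are
# referenced but never shown; this is a hedged minimal reconstruction, not
# the author's real code. The topic whitelist is borrowed from Example #8.
import logging

logger = logging.getLogger(__name__)

def validate_2_char_iso_code(code):
    # Accept exactly two alphabetic characters; case is left untouched
    # because newscatcher uses uppercase countries and lowercase languages.
    if not (isinstance(code, str) and len(code) == 2 and code.isalpha()):
        raise ValueError(f"not a 2-character ISO code: {code!r}")
    return code

def validate_topic(topic):
    allowed = {'tech', 'news', 'business', 'science', 'finance', 'food',
               'politics', 'economics', 'travel', 'entertainment', 'music',
               'sport', 'world'}
    if topic.lower() not in allowed:
        raise ValueError(f"unsupported topic: {topic!r}")
    return topic.lower()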
Example #3
    def __init__(self, topic: str, language: str = 'en', n=None):
        self.urls = urls(topic=topic, language=language)
        if n is not None:
            self.urls = self.urls[:n]
        self.topic = topic

        print(f"found {len(self.urls)} urls for {topic} topic")
        print(self.urls)
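
# Usage sketch, assuming the __init__ above belongs to a class named
# TopicFeeds (the enclosing class is not shown, so the name is invented):
#
#   feeds = TopicFeeds('finance', n=5)   # prints the count and first 5 urls
#   print(feeds.topic)                   # 'finance'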
Example #4
def available_topics():
    # Collect every topic advertised by any known feed. describe_url can
    # raise for unreachable or malformed sites, so those are skipped.
    topics = set()
    for site in urls():
        try:
            des = describe_url(site)
            if des is None or des['topics'] is None:
                continue
            topics.update(des['topics'])
        except Exception:
            # print(site)
            pass
    return topics
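
# Usage sketch: this enumerates every known feed and calls describe_url on
# each, so it is slow; the result should roughly match the hard-coded topic
# list in Example #8 below.
#
#   print(sorted(available_topics()))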
Example #5
    async def news_urls(self, ctx):
        aus_substring = ".au"
        blog_substring = "blog"

        english_urls = urls(language='en')
        pprint(english_urls)
        send_type = "The enlish_urls list registers as a {} python variable.".format(
            type(english_urls))
        send_len = "The overall list is {} urls long.".format(
            len(english_urls))
        send_aus = "Of those urls, {} contain .au somewhere.".format(
            len([i for i in english_urls if aus_substring in i]))
        send_blog = "Lastly, {} refer to themselves as a 'blog' in some way.".format(
            len([i for i in english_urls if blog_substring in i]))
        await ctx.send(send_type)
        await ctx.send(send_len)
        await ctx.send(send_aus)
        await ctx.send(send_blog)
        await ctx.send("The Australian-identifying urls are:")
        for i in english_urls:
            if ".au" in i:
                await ctx.send(i)
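
# news_urls is written as a discord.py command coroutine; a hedged sketch of
# the cog it presumably lives in (class name and bot wiring are assumptions,
# not shown in the original):
#
#   from discord.ext import commands
#   from newscatcher import urls
#   from pprint import pprint
#
#   class NewsCog(commands.Cog):
#       def __init__(self, bot):
#           self.bot = bot
#
#       @commands.command()
#       async def news_urls(self, ctx):
#           ...  # body as above
#
#   bot.add_cog(NewsCog(bot))  # await this call on discord.py 2.x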
Example #6
def newscatchernews(request):
    # Collect recent articles from every Indian feed known to newscatcher.
    # The search criteria is read from the query string but not applied yet.
    newslist = []
    searchcriteria = request.GET.get('search')
    #print("criteria:", searchcriteria)

    counter = 0
    for IndianURL in urls(country='IN'):
        nc = Newscatcher(website=IndianURL)
        results = nc.get_news()

        if results is not None and results['articles'] is not None:
            for article in results['articles']:
                # Keep the first parseable date; fall back to "x" if none.
                dateresult = "x"
                for match in datefinder.find_dates(article.published):
                    dateresult = match.strftime("%Y-%m-%d %H:%M")
                    break

                # summary_detail's value holds HTML; reduce it to plain text.
                txt = list(article.summary_detail.values())[3]
                detailtext = BeautifulSoup(txt, "html.parser").get_text()

                counter += 1
                newslist.append({
                    'Source': IndianURL,
                    'Title': article.title,
                    'Published': dateresult,
                    'Summary_Detail': detailtext,
                    'link': article.link,
                    'id': "head_" + str(counter)
                })
    return newslist
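
# newscatchernews reads request.GET, so it is presumably a Django helper; a
# hypothetical wiring sketch (view name, route, and JSON shape are invented):
from django.http import JsonResponse
from django.urls import path

def news_api(request):
    return JsonResponse({'articles': newscatchernews(request)})

urlpatterns = [path('news/', news_api)]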
Example #7
def describe_sources(source):
    # Preview headlines for the given source, then inspect and build a
    # reader for the second politics feed.
    nc = Newscatcher(website=source)
    nc.print_headlines(n=10)
    urls_pol = urls(topic='politics')
    print(describe_url(urls_pol[1]))
    res = Newscatcher(website=urls_pol[1], topic='politics')
    return res
Example #8
handler.setLevel(logging.DEBUG)
app.logger.addHandler(handler)
app.logger.info('starting')
loop = asyncio.get_event_loop()

# not yet used
topics = [
    'tech', 'news', 'business', 'science', 'finance', 'food', 'politics',
    'economics', 'travel', 'entertainment', 'music', 'sport', 'world'
]

with open('twp.txt', 'r') as f:
    twp = f.read().split()

sites = urls(language='en')  # lowercase language code, matching the other examples

HN = 'https://hacker-news.firebaseio.com/v0'
# Fetch today's Bing homepage image metadata and build a full image URL.
res = requests.get(
    'https://www.bing.com/HPImageArchive.aspx?format=js&idx=0&n=1')
bingurl = res.json()
bimgurl = 'https://bing.com/' + bingurl['images'][0]['url']


@app.route('/')
def myindex():
    app.logger.info("myindex")
    return redirect(url_for('do_hn'))


@app.route('/twp')
Example #9
    except Exception as e:
        print(website)
        return None

    # print(a+a)
    # for c in count:
    #
    #     print(f"{c}:{keyword_list.count(c)}")


def download_all_sites(sites):
    # Scrape every site in parallel and merge the per-site keyword counts.
    r = Parallel(n_jobs=8, verbose=10)(delayed(get_trend_for_website)(i, 10000)
                                       for i in sites)
    # res, i = zip(*r)
    df = r[0]
    for i in range(1, len(r)):
        if r[i] is not None:
            df = df.add(r[i], fill_value=0)
    df = pd.DataFrame(df)
    print(df.sort_values(by=df.columns[0]))

    # a = (df['keyword'].value_counts(dropna=False)[:20])
    # print(a)
    # print(df.columns.values)
    # print(df.sort(['ke', 'B'], ascending=[1, 0]))


english_sites = urls(language="en")
print(len(english_sites))
sites = english_sites[:2]
print(sites)
download_all_sites(sites)
Example #10
    def __init__(self, topic):
        self.topic = topic
        self.sources = urls(topic=topic, language="en")
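
# Usage sketch for the class above (its name is not shown; "SourceList" is
# invented): instantiate with a topic, then read the matching feed URLs.
#
#   feeds = SourceList('science')
#   print(feeds.sources[:3])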
Example #11
from newscatcher import describe_url, Newscatcher, urls
url = 'news.ycombinator.com'
url2 = 'ycombinator.com'
eng_lnx = urls(language='en')
nc = Newscatcher(website=url)
try:
    print("looking for " + url + "...")
    nc.get_news()
except Exception as e:
    print(repr(e))
print(describe_url(url))
print(url + ' in urls: ' + str(url in eng_lnx))
print(url2 + ' in urls: ' + str(url2 in eng_lnx))
nc2 = Newscatcher(website=url2)
try:
    print("looking for " + url2 + "...")
    nc2.get_news()
except Exception as e:
    print(repr(e))
Example #12
from newscatcher import Newscatcher, urls

nc = Newscatcher(website='wsj.com')
results = nc.get_news()
#
# # results.keys()
# # 'url', 'topic', 'language', 'country', 'articles'
#
# # Get the articles
# articles = results['articles']
#
# first_article_summary = articles[0]['summary']
# first_article_title = articles[0]['title']
#
finance = urls(topic='finance')
#
# # URLs by COUNTRY
# american_urls = urls(country='US')
#
# # URLs by LANGUAGE
# english_urls = urls(language='en')
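
# Follow-on sketch (not part of the original example): print a few headlines
# from the first finance feed, using print_headlines as seen in Example #7.
if finance:
    Newscatcher(website=finance[0], topic='finance').print_headlines(n=5)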