def get_urls(criteria):
    result = {'urls': [], 'type': None}
    if criteria.lower() in supported_topics:
        result['urls'] = urls(topic=criteria)
        result['type'] = 'topic'
    if criteria.upper() in supported_countries:
        result['urls'] = urls(country=criteria)
        result['type'] = 'country'
    if criteria.lower() in supported_urls:
        result['urls'] = [criteria.lower()]
        result['type'] = 'website'
    return result
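A minimal sketch of how get_urls might be driven. The supported_topics, supported_countries, and supported_urls lookups are hypothetical stand-ins (the topic names simply mirror the list used in the Flask snippet further down); only the urls() call from newscatcher is taken from the snippets themselves.

from newscatcher import urls

# Hypothetical lookup tables; real values would come from whatever the
# surrounding project treats as supported.
supported_topics = {'tech', 'news', 'business', 'science', 'finance', 'food',
                    'politics', 'economics', 'travel', 'entertainment',
                    'music', 'sport', 'world'}
supported_countries = {'US', 'GB', 'IN', 'AU'}        # assumed examples
supported_urls = {'wsj.com', 'news.ycombinator.com'}  # assumed examples

print(get_urls('finance'))  # {'urls': [...], 'type': 'topic'}
print(get_urls('IN'))       # {'urls': [...], 'type': 'country'}
print(get_urls('wsj.com'))  # {'urls': ['wsj.com'], 'type': 'website'}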
def retrieve_urls(country=None, language=None, topic=None):
    """
    This function calls the newscatcher api and returns the urls to be invoked.
    """
    try:
        if country:
            country = None if country == "ALL" else validate_2_char_iso_code(country)
        if language:
            language = None if language == "ALL" else validate_2_char_iso_code(language)
        if topic:
            topic = None if topic == "ALL" else validate_topic(topic)
        logger.info(
            f"Parameters to retrieve list are - country:{country}, language:{language}, topic:{topic}"
        )
        url_list = newscatcher.urls(country=country, language=language, topic=topic)
        logger.debug(f"retrieved url list: {url_list}")
        return url_list
    except TypeError:
        logger.error("Fetching urls threw an Error")
        raise
def __init__(self, topic: str, language: str = 'en', n=None):
    self.urls = urls(topic=topic, language=language)
    if n is not None:
        self.urls = self.urls[:n]
    self.topic = topic
    print(f"found {len(self.urls)} urls for {topic} topic")
    print(self.urls)
def available_topics():
    topics = set()
    for i in urls():
        try:
            des = describe_url(i)
            if des is None or des['topics'] is None:
                continue
            site_topics = des['topics']
            for j in site_topics:
                topics.add(j)
        except Exception:
            # print(i)
            pass
    return topics
async def news_urls(self, ctx):
    aus_substring = ".au"
    blog_substring = "blog"
    english_urls = urls(language='en')
    pprint(english_urls)
    send_type = "The english_urls list registers as a {} python variable.".format(
        type(english_urls))
    send_len = "The overall list is {} urls long.".format(len(english_urls))
    send_aus = "Of those urls, {} contain .au somewhere.".format(
        len([i for i in english_urls if aus_substring in i]))
    send_blog = "Lastly, {} refer to themselves as a 'blog' in some way.".format(
        len([i for i in english_urls if blog_substring in i]))
    await ctx.send(send_type)
    await ctx.send(send_len)
    await ctx.send(send_aus)
    await ctx.send(send_blog)
    await ctx.send("The Australian-identifying urls are:")
    for i in english_urls:
        if aus_substring in i:
            await ctx.send(i)
def newscatchernews():
    newslist = []
    searchcriteria = None
    IndianURLs = urls(country='IN')
    searchcriteria = request.GET.get('search')
    # print("criteria:", searchcriteria)
    det = []
    counter = 0
    for IndianURL in IndianURLs:
        nc = Newscatcher(website=IndianURL)
        results = nc.get_news()
        if results is not None and results['articles'] is not None:
            articles = results['articles']
            for article in articles:
                datesfound = datefinder.find_dates(article.published)
                dateresult = "x"
                for match in datesfound:
                    dateresult = match.strftime("%Y-%m-%d %H:%M")
                txt = list(article.summary_detail.values())[3]
                detailtext = BeautifulSoup(txt, "html.parser").get_text()
                counter = counter + 1
                newslist = newslist + [{
                    'Source': IndianURL,
                    'Title': article.title,
                    'Published': dateresult,
                    'Summary_Detail': detailtext,
                    'link': article.link,
                    'id': "head_" + str(counter)
                }]
    return newslist
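The view above reads searchcriteria from the request but never applies it. A minimal, hypothetical sketch of one way the filter could be applied to the collected items, assuming a case-insensitive substring match on the title:

def filter_news(newslist, searchcriteria):
    # Keep only items whose title contains the search string; pass everything
    # through when no criteria was supplied.
    if not searchcriteria:
        return newslist
    needle = searchcriteria.lower()
    return [item for item in newslist if needle in item['Title'].lower()]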
def describe_sources(source):
    nc = Newscatcher(website=source)
    nc.print_headlines(n=10)
    urls_pol = urls(topic='politics')
    describe_url(urls_pol[1])
    res = Newscatcher(website=urls_pol[1], topic='politics')
handler.setLevel(logging.DEBUG)
app.logger.addHandler(handler)
app.logger.info('starting')

loop = asyncio.get_event_loop()  # not yet used

topics = [
    'tech', 'news', 'business', 'science', 'finance', 'food', 'politics',
    'economics', 'travel', 'entertainment', 'music', 'sport', 'world'
]

twp = []
with open('twp.txt', 'r') as f:
    twp = f.read().split()

sites = urls(language='EN')

HN = 'https://hacker-news.firebaseio.com/v0'

res = requests.get(
    'https://www.bing.com/HPImageArchive.aspx?format=js&idx=0&n=1')
bingurl = res.json()
bimgurl = 'https://bing.com/' + bingurl['images'][0]['url']


@app.route('/')
def myindex():
    app.logger.info("myindex")
    return redirect(url_for('do_hn'))


@app.route('/twp')
    except Exception as e:
        print(website)
        return None
    # print(a+a)
    # for c in count:
    #     print(f"{c}:{keyword_list.count(c)}")


def download_all_sites(sites):
    r = Parallel(n_jobs=8, verbose=10)(
        delayed(get_trend_for_website)(i, 10000) for i in sites)
    # res, i = zip(*r)
    df = r[0]
    for i in range(1, len(r)):
        if r[i] is not None:
            df = df.add(r[i], fill_value=0)
    df = pd.DataFrame(df)
    print(df.sort_values())
    # a = (df['keyword'].value_counts(dropna=False)[:20])
    # print(a)
    # print(df.columns.values)
    # print(df.sort(['ke', 'B'], ascending=[1, 0]))


print(len(urls(language="en")))
sites = urls(language="en")[:2]
print(sites)
download_all_sites(sites)
def __init__(self, topic):
    self.topic = topic
    self.sources = urls(topic=topic, language="en")
from newscatcher import describe_url
from newscatcher import Newscatcher
from newscatcher import urls

url = 'news.ycombinator.com'
url2 = 'ycombinator.com'

eng_lnx = urls(language='en')

nc = Newscatcher(website=url)
try:
    print("looking for " + url + "...")
    nc.get_news()
except Exception as e:
    print(repr(e))

describe_url(url)

print(url + ' in urls: ' + str(url in eng_lnx))
print(url2 + ' in urls: ' + str(url2 in eng_lnx))

nc2 = Newscatcher(website='ycombinator.com')
try:
    print("looking for " + url2 + "...")
    nc2.get_news()
except Exception as e:
    print(repr(e))
from newscatcher import Newscatcher, urls

nc = Newscatcher(website='wsj.com')
results = nc.get_news()

# results.keys()
# 'url', 'topic', 'language', 'country', 'articles'

# Get the articles
# articles = results['articles']
#
# first_article_summary = articles[0]['summary']
# first_article_title = articles[0]['title']

# finance = urls(topic='finance')

# URLs by COUNTRY
# american_urls = urls(country='US')

# URLs by LANGUAGE
# english_urls = urls(language='en')
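Building on the calls shown above, a small sketch that combines urls() with Newscatcher to print the first headline of a few finance sources. The [:3] slice and the empty-result guard are additions of this sketch; the constructor arguments, get_news(), and the 'articles'/'title' keys all appear in the snippets above.

from newscatcher import Newscatcher, urls

# Print the first headline from a few finance sources.
for site in urls(topic='finance')[:3]:
    results = Newscatcher(website=site, topic='finance').get_news()
    if results is None or not results['articles']:
        continue
    print(site, '-', results['articles'][0]['title'])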