def search(STATE=None):
    if STATE is None:
        raise Exception('You must provide STATE')
    if STATE.search_results == []:
        query = input('Enter search term: ')
        if query == '':
            return [], STATE
        query = parse_search_query(query)
        params = {
            'search_query': query,
            'start': 0,
            'max_results': 50,
            'sortBy': 'relevance',
            'sortOrder': 'descending'
        }
        query = '&'.join([str(x) + '=' + str(y) for x, y in params.items()])
        r = requests.get(endpoint() + query, timeout=10)
        results = atoma.parse_atom_bytes(r.content).entries
        STATE.search_results = results
    else:
        results = STATE.search_results
    results = [feed.Feed(r) for r in results]
    return results, STATE
def fetch(self):
    get = self.session.get if self.session is not None else requests.get
    resp = get(self.url, headers=generate_headers(self.url))
    try:
        return parse_atom_bytes(resp.content)
    except Exception:
        # Fall back to RSS parsing if the content is not valid Atom.
        return parse_rss_bytes(resp.content)
def search(STATE=None):
    if STATE is None:
        raise Exception('You must provide STATE')
    if STATE.search_results == []:
        print('ti:title au:author abs:abstract co:comment')
        print('jr:journal ref cat:subj-cat rn:report number')
        print('BLANK INPUT TO GO BACK')
        print(' ')
        query = input('Enter search term: ')
        if query == '':
            return [], STATE
        params = {
            'search_query': query,
            'start': 0,
            'max_results': 50,
            'sortBy': 'relevance',
            'sortOrder': 'descending'
        }
        query = '&'.join([str(x) + '=' + str(y) for x, y in params.items()])
        r = requests.get(endpoint() + query, timeout=10)
        results = atoma.parse_atom_bytes(r.content).entries
        STATE.search_results = results
    else:
        results = STATE.search_results
    results = [feed.Feed(r) for r in results]
    return results, STATE
def get_provinces_atoms_url(url, province_code=None):
    """
    Read the general Catastro INSPIRE Atom feed, which links to the
    individual Atom feeds for each province.

    Return a list of (url, title) tuples for those feeds, optionally
    filtered to a single province when province_code is given.
    """
    response = requests.get(url)
    feed = atoma.parse_atom_bytes(response.content)
    atoms_provincias = []
    for entry in feed.entries:
        if province_code is not None:
            if os.path.basename(entry.links[0].href).split('.')[3] == \
                    'atom_{}'.format(str(province_code).zfill(2)):
                url = parse_url(entry.links[0].href)
                title = entry.title.value
                atoms_provincias.append((url, title))
        else:
            url = parse_url(entry.links[0].href)
            title = entry.title.value
            atoms_provincias.append((url, title))
    return atoms_provincias
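# A minimal usage sketch for get_provinces_atoms_url above. The index URL below is a
# hypothetical placeholder for the Catastro INSPIRE Atom index, not a value taken from
# the snippet; requests, os, atoma and parse_url must be importable as they are there.
if __name__ == '__main__':
    INDEX_URL = 'https://example.org/inspire/ES.SDGC.BU.atom.xml'  # hypothetical
    for atom_url, title in get_provinces_atoms_url(INDEX_URL, province_code=15):
        print(title, atom_url)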
def syncronize(config_file, config, username, last_updated, mastodon):
    pixelfeed_get = requests.get(
        'https://pixelfed.social/users/{}.atom'.format(username))
    pixelfeed = atoma.parse_atom_bytes(pixelfeed_get.content)
    latest_post = pixelfeed.entries[0]
    last_updated_atom = latest_post.updated
    if last_updated == last_updated_atom:
        print("Up-to-date")
        sys.exit(0)

    config['updated'] = last_updated_atom
    with open(config_file, "w") as f:
        toml.dump(config, f)
    print("Config file updated")

    image_name = latest_post.title.value
    image_url = re.search(r"(?P<url>https?://[^\s]+)", latest_post.summary.value) \
        .group("url") \
        .rstrip('">') \
        .replace("_thumb", "")

    tmp = tempfile.NamedTemporaryFile(suffix=".jpg")
    get_image = requests.get(image_url)
    tmp.write(get_image.content)

    mastodon_media = mastodon.media_post(tmp.name)
    mastodon.status_post(image_name, media_ids=mastodon_media['id'])
    print("Status posted: ", image_name)
    tmp.close()
def get(self, code_id):
    """
    Bounces the information from the 9gag RSS API to our own, with caching.

    Example::

        http://127.0.0.1:5000/9GAGComic

    Arguments:
        code_id {str} -- Name of the channel

    Returns:
        [requests.text] -- Information acquired from the 9gag RSS API feed.
    """
    cache_timer = 20  # 20 secs
    # TODO: Feature, remove the oldest item in the cache
    if len(cache.keys()) >= 1:  # do we have anything stored?
        if (int(time.time()) - max(cache.keys())) < cache_timer:
            # (current time - cached time) < time gap
            logger.debug('Using Cached Version.')
            return cache[max(cache.keys())]
    params = {'code': code_id, 'format': '1'}
    logger.debug('Getting Fresh Copy.')
    resp = requests.get('https://9gag-rss.com/api/rss/get', params=params)
    feed = atoma.parse_atom_bytes(resp.content)
    cache[int(time.time())] = resp.text
    return resp.text
def get_posts(self):
    try:
        response = requests.get(
            "http://kempfolds.blogspot.com/feeds/posts/default")
        feed = atoma.parse_atom_bytes(response.content)
        return feed
    except requests.exceptions.RequestException as e:
        print(e)
async def fetch(self):
    async with self.session.get(self.url,
                                headers=generate_headers(self.url)) as response:
        content = await response.read()
        try:
            return parse_atom_bytes(content)
        except Exception:
            # Fall back to RSS parsing if the content is not valid Atom.
            return parse_rss_bytes(content)
def _get_feed(self):
    """Get issues from the feed url."""
    r = requests.get(self.feed)
    if r.status_code != requests.codes.ok:
        log.debug(
            f'{r.status_code} Error: {r.reason} for url: {self.feed}')
        return []
    return atoma.parse_atom_bytes(r.content).entries
def feeds_public(self, sort="hot"): r = self.session.get("https://ruqqus.com/feeds/" + sort) try: return [ self.post(i.id_.split('/')[-2]) for i in atoma.parse_atom_bytes(r.content).entries ] except atoma.exceptions.FeedXMLError: return r
def get_python_insider_news():
    response = requests.get(
        'http://feeds.feedburner.com/PythonInsider?fmt=xml')
    feed = atoma.parse_atom_bytes(response.content)
    t = 0
    d = feedparser.parse(
        url_file_stream_or_string=
        'view-source:http://feeds.feedburner.com/PythonInsider?fmt=xml')
    entries = d['entries']
    t = 0
def _get_feed(feed_content, payload):
    if payload['source']['type'] == 'atom':
        feed = atoma.parse_atom_bytes(feed_content)
    elif payload['source']['type'] == 'rss':
        feed = atoma.parse_rss_bytes(feed_content)
    else:
        raise Exception(
            "SourceError",
            "Unknown feed type '%s'. Choose 'rss' or 'atom'." %
            payload['source']['type'])
    return feed
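# A minimal usage sketch for _get_feed above. The payload layout and the inline Atom
# document are assumptions made only to illustrate the dispatch on payload['source']['type'].
sample_payload = {'source': {'type': 'atom'}}
sample_bytes = b"""<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
  <id>urn:example:feed</id>
  <title>Example feed</title>
  <updated>2021-01-01T00:00:00Z</updated>
</feed>"""
feed = _get_feed(sample_bytes, sample_payload)
print(feed.title.value)  # -> Example feed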
def get_single_issue_by_atom(issue_id):
    url = ATOM_URL + "issues" + "/" + issue_id + ".atom" + "?key=" + ATOM_KEY
    response = requests.get(url, timeout=(3.0, 7.5))
    print(url)
    feed = atoma.parse_atom_bytes(response.content)
    if feed.entries is None or len(feed.entries) == 0:
        return False
    latest_entry = feed.entries[-1]
    author = latest_entry.authors[0].name
    content = sanitize_html_tag(latest_entry.content.value)
    return wrap_long_text("【更新】【{author}】 {content}".format(author=author,
                                                              content=content))
def crawl_arxiv(categories: List[str],
                max_results: int = 1000,
                sleep_time: int = 5,
                fetch_size: int = 100,
                output: str = '.'):
    docs = []
    base_url = 'http://export.arxiv.org/api/query?'
    base_oai = 'http://export.arxiv.org/oai2?verb=GetRecord&identifier=oai:arXiv.org:{}&metadataPrefix=arXiv'
    oai_tag = '{http://www.openarchives.org/OAI/2.0/}'
    meta_list = []
    for category in categories:
        print('Looking up papers in {}'.format(category))
        url = "{}search_query=cat:{}&max_results={}&sortBy=lastUpdatedDate&sortOrder=descending".format(
            base_url, category, max_results)
        response = requests.get(url)
        feed = atoma.parse_atom_bytes(response.content)
        entries = feed.entries
        for entry in tqdm(entries):
            entry_link = entry.id_
            entry_index = entry_link.rfind('/')
            entry_id = entry_link[entry_index + 1:]
            version_marker = entry_id.rfind('v')
            entry_id = entry_id[:version_marker]
            oai_url = base_oai.format(entry_id)
            metadata_response = requests.get(oai_url)
            if metadata_response.status_code == 200:
                metadata = metadata_response.text
                root = ET.fromstring(metadata)
                record = root.find('{}GetRecord'.format(oai_tag))
                if record is not None:
                    license_link = find_license(record)
                    if is_cc_license(license_link):
                        setattr(entry, 'license', license_link)
                        meta = download_document(entry, output)
                        docs.append(entry)
                        meta_list.append(meta)
                        if len(docs) >= fetch_size:
                            break
            sleep(sleep_time)
        if len(docs) >= fetch_size:
            print("I found what I was looking for. We can stop searching.")
            break
    print('Found {} documents'.format(len(docs)))
    with open('{}/meta.json'.format(output), 'w') as fout:
        json.dump(meta_list, fout)
    return docs, meta_list
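# A minimal usage sketch for crawl_arxiv above. The category, sizes and output directory
# are arbitrary example values; the helpers it relies on (find_license, is_cc_license,
# download_document) must be available as in that snippet.
import os

os.makedirs('./arxiv_dump', exist_ok=True)  # meta.json is written into this directory
docs, meta = crawl_arxiv(['cs.CL'], max_results=200, fetch_size=10, output='./arxiv_dump')
print('downloaded metadata for {} documents'.format(len(meta)))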
def get_context(self, request, *args, **kwargs):
    context = super(HomePage, self).get_context(request, *args, **kwargs)

    # Quick links
    quick_links = QuickLink.objects.all().order_by("result_weighting", "title")
    context["quick_links"] = quick_links

    # News
    news_items = (NewsPage.objects.live().public().order_by(
        "-pinned_on_home",
        "home_news_order_pages__order",
        "-first_published_at",
    )[:8])
    context["news_items"] = news_items

    # Tweets
    if not cache.get("homepage_tweets"):
        cache.set(
            "homepage_tweets",
            sorted(get_tweets(), key=lambda x: x.created_at, reverse=True),
            3000,
        )
    context["tweets"] = cache.get("homepage_tweets")[:3]

    # What's popular
    context["whats_popular_items"] = WhatsPopular.objects.all()

    # How do I
    context["how_do_i_items"] = (HowDoI.objects.filter(
        include_link_on_homepage=True).live().public().order_by(
            "title",
        )[:10])

    # GOVUK news
    if not cache.get("homepage_govuk_news"):
        govuk_news_feed_url = "https://www.gov.uk/search/news-and-communications.atom?organisations%5B%5D=department-for-international-trade"
        response = requests.get(govuk_news_feed_url)
        feed = atoma.parse_atom_bytes(response.content)
        cache.set(
            "homepage_govuk_news",
            feed.entries[:6],
            3000,
        )
    context["govuk_feed"] = cache.get("homepage_govuk_news")

    return context
def __call__(self) -> Optional[str]:
    response = self.session.get(
        F'https://github.com/{self.user}/{self.repo}/releases.atom')
    response.raise_for_status()
    feed = atoma.parse_atom_bytes(response.content)
    versions = []
    for entry in feed.entries:
        title = entry.title.value
        if any(block in title.lower() for block in self.VERSION_BLOCKLIST):
            continue
        version = self.version_from_title(title)
        if version:
            versions.append(packaging.version.parse(version))
    return str(max(versions)) if len(versions) > 0 else None
def get_weather(self):
    r = requests.get(self.link)
    if r.status_code != 200:
        raise ValueError("request returned HTTP response of {}".format(
            r.status_code))
    feed = atoma.parse_atom_bytes(r.content)
    self.title = feed.title.value
    self.updated = feed.updated
    current_conditions = None
    self.forecast = OrderedDict()
    self.alerts = []
    for entry in feed.entries:
        if entry.categories[0].term == self.translate('Current Conditions'):
            if current_conditions is not None:
                raise ValueError(
                    "There is more than one current conditions...")
            current_conditions = entry
        elif entry.categories[0].term == self.translate('Warnings and Watches'):
            if entry.summary.value == self.translate(
                    'No watches or warnings in effect.'):
                self.has_alerts = False
            else:
                self.has_alerts = True
                self.alerts.append(entry)
        elif entry.categories[0].term == self.translate('Weather Forecasts'):
            e = entry.title.value.split(':')[0]
            self.forecast[e] = entry
        else:
            print(
                "Error, unidentified category {}. Notify developer".format(
                    entry.categories[0].term))
    self.current_conditions = {}
    self.current_summary = current_conditions.title.value
    current_conditions = html.unescape(
        current_conditions.summary.value).split('<br/>\n')
    for entry in current_conditions:
        entry_elems = entry.split(':</b>')
        self.current_conditions[entry_elems[0][3:].strip()] = \
            entry_elems[1].split('<br/>')[0].strip()
def getListOfRecentCommits(repo_name: str) -> Generator[dict, None, None]:
    # Fetch commit feed
    commit_feed_raw = requests.get(f"https://github.com/{repo_name}/commits.atom")

    # Turn into atom object
    commit_feed = atoma.parse_atom_bytes(commit_feed_raw.content)

    # Handle each entry
    for commit in commit_feed.entries:
        # Build data output
        yield {
            "name": commit.title.value,
            "author": commit.authors[0].name,
            "date": commit.updated.strftime("%b %d, %Y"),
            "number": commit.links[0].href.split("/")[-1],
        }

    return
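# A minimal usage sketch for getListOfRecentCommits above; the repository name is an
# arbitrary public example, not one referenced by the snippet.
for commit in getListOfRecentCommits("python/cpython"):
    print(commit["date"], commit["number"], commit["name"], "by", commit["author"])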
def main():
    repos_list = get_repos_list(REPOS_FILENAME)
    urls = map(create_url, repos_list)
    atom_feeds = map(get_atom_feed, urls)
    parsed_atom_feeds = map(lambda x: atoma.parse_atom_bytes(x), atom_feeds)
    releases_last_24h = list(retrieve_releases_in_last_24h(parsed_atom_feeds))

    version_list = []
    url_list = []
    for i in range(len(releases_last_24h)):
        versions_list = list(map(lambda x: x.title.value, releases_last_24h[i]))
        version_urls = list(map(lambda x: x.links, releases_last_24h[i]))
        version_list.extend(versions_list)
        url_list.extend(version_urls)

    url_decoded_list = []
    for j in range(len(url_list)):
        ver_urls = list(map(lambda x: x.href, url_list[j]))
        url_decoded_list.extend(ver_urls)

    # None means no column-width limit; newer pandas no longer accepts -1 here.
    pd.set_option('display.max_colwidth', None)
    df = pd.DataFrame(list(zip(version_list, url_decoded_list)),
                      columns=['version', 'url'])
    df['repo'] = df['url'].str.split('/').str[4]

    if len(version_list) < 1:
        print("No new versions released")
    else:
        print('New versions released: ' + '\n' +
              tabulate(df, headers='keys', tablefmt="psql", showindex=False))
        for i in range(len(df)):
            data = {
                'version': [df.loc[i, 'version']],
                'url': [df.loc[i, 'url']],
                'repo': [df.loc[i, 'repo']],
            }
            df2 = pd.DataFrame(data, columns=['version', 'url', 'repo'])
            data = np.squeeze(np.asarray(df2))
            slack.send_slack_message(data)
def get_municipality_atoms_url(atom_url, codmun=None):
    """
    Read the province-specific Atom feed.
    Return the Atom URL of each municipality together with its EPSG code.
    An optional codmun parameter restricts the result to a single municipality.
    """
    response = requests.get(atom_url)
    feed = atoma.parse_atom_bytes(response.content)
    urls = []
    for entry in feed.entries:
        url = parse_url(entry.links[0].href)
        epsg = entry.categories[0].term.split('/')[-1]
        codmun_atom = os.path.basename(url).split('.')[4]
        if codmun is None or codmun == codmun_atom:
            urls.append((url, epsg))
    return urls
def main():  # pylint: disable=too-many-locals
    """Intended to be used as part of a GitHub Action.

    Requires INPUT_LAYERID, INPUT_TIMEFRAME and INPUT_UNITS environment
    variables to be set within the environment that the script is run.

    Prints "set-output" commands that create Outputs within GitHub Actions.
    The Outputs created are:
    * updateFound - True if an update was found within the specified timeframe,
      otherwise False
    * publishedTime - The time the data update was published in the
      'Pacific/Auckland' timezone
    * totalFeatures - The total number of features in the entire dataset after
      the update
    * adds - The number of added features in the update
    * modifies - The number of modified features in the update
    * deletes - The number of deleted features in the update

    Raises
    ------
    ValueError
        If environment variable INPUT_UNITS is not either "minutes", "hours" or "days".
    """
    layer_id = os.environ["INPUT_LAYERID"]
    timeframe = int(os.environ["INPUT_TIMEFRAME"])
    units = os.environ["INPUT_UNITS"]
    if units not in ["minutes", "hours", "days"]:
        raise ValueError("units should be either 'minutes', 'hours' or 'days'")

    response = requests.get(
        f"{KX_SITE_URL}/feeds/layers/{layer_id}/revisions/")
    feed = atoma.parse_atom_bytes(response.content)

    update_found = False
    dataset_title = None
    revision_number = None
    total_features = None
    adds = None
    modifies = None
    deletes = None
    published_time = None
    todays_date = pendulum.now("UTC")

    for entry in feed.entries:
        published_time = pendulum.instance(entry.published)
        time_since_publish = diff_timeframe(todays_date, published_time, units)
        if time_since_publish < timeframe:
            total_features, adds, modifies, deletes, total_changes = extract_feature_counts(
                entry.summary.value)
            # Ignore vector / table dataset updates with no feature changes
            if total_changes == 0:
                continue
            update_found = True
            dataset_title = entry.title.value.split(f" ({layer_id}", 1)[0]
            revision_number = entry.title.value.rsplit(" ", 1)[-1]
            # Skip for raster datasets where feature counts are 'None'
            if total_features:
                # Add commas as thousands separators on feature counts
                adds = f"{int(adds):,}"
                modifies = f"{int(modifies):,}"
                deletes = f"{int(deletes):,}"
                total_features = f"{int(total_features):,}"
            # Find only the most recent change
            break
        # Set published_time to None if the dataset update is not within the
        # required timeframe
        published_time = None

    # Modify published time to readable format in local timezone
    if published_time:
        published_time = published_time.in_timezone(OUTPUT_TIMEZONE)
        published_time = published_time.format(OUTPUT_TIME_FORMAT)

    print(f"::set-output name=updateFound::{update_found}")
    print(f"::set-output name=datasetTitle::{dataset_title}")
    print(f"::set-output name=revisionNumber::{revision_number}")
    print(f"::set-output name=publishedTime::{published_time}")
    print(f"::set-output name=totalFeatures::{total_features}")
    print(f"::set-output name=adds::{adds}")
    print(f"::set-output name=modifies::{modifies}")
    print(f"::set-output name=deletes::{deletes}")
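# A minimal sketch of driving main() above outside of GitHub Actions, assuming
# KX_SITE_URL and the helper functions are defined as in that snippet. The layer id
# and timeframe are hypothetical example values.
import os

os.environ["INPUT_LAYERID"] = "12345"  # hypothetical layer id
os.environ["INPUT_TIMEFRAME"] = "24"
os.environ["INPUT_UNITS"] = "hours"
main()  # prints the ::set-output lines described in the docstring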
import os

import atoma
import requests


def notify(title, text):
    os.system(
        """osascript -e 'display notification "{}" with title "{}"'""".format(
            text, title))


timestamp_lastcommit = atoma.parse_atom_bytes(
    requests.get(
        "https://github.com/CSSEGISandData/COVID-19/commits/master.atom"
    ).content).updated.timestamp()

if os.path.getmtime("cntry_stat.json") < timestamp_lastcommit:
    notify(
        title="COVID-19 data updated",
        text="There is a new commit in the COVID-19 data repository!",
    )
def main():
    parser = argparse.ArgumentParser()
    # Required parameters
    parser.add_argument(
        "-r", "--request_interval",
        default=3600,
        type=int,
        help="The request_interval (default: 3600) is the number of seconds to wait "
             "between requests to the arXiv API.",
    )
    parser.add_argument(
        "-q", "--arxiv_query",
        default="nlp+OR+bert",
        type=str,
        help="The arxiv_query (default: 'nlp+OR+bert') is used for automatic retrieval "
             "of the latest papers from arXiv. Spaces should be replaced with '+'. "
             "See: https://arxiv.org/help/api.",
    )
    parser.add_argument(
        "-d", "--days_since",
        default=10,
        type=int,
        help="The days_since (default: 10) defines how many past days still count as an "
             "update, e.g. 10 means that only papers on arXiv updated no more than 10 days "
             "ago are forwarded and posted to Twitter.",
    )
    parser.add_argument(
        "-t", "--hashtags_prepend",
        default="#NLP,#MachineLearning",
        type=str,
        help="The list of hashtags (default: '#NLP,#MachineLearning') to prepend to the "
             "tweet, separated by ','.",
    )
    args = parser.parse_args()

    ARXIV_QUERY = args.arxiv_query
    REQUEST_INTERVAL = args.request_interval
    HASHTAGS2PREPEND = args.hashtags_prepend.split(",")

    # verification
    auth = tweepy.OAuthHandler(TWITTER_APP_KEY, TWITTER_APP_SECRET)
    auth.set_access_token(TWITTER_KEY, TWITTER_SECRET)
    api = tweepy.API(auth)

    import urllib.request

    # For details of the arXiv API, see:
    # https://arxiv.org/help/api/user-manual#title_id_published_updated
    url = ('http://export.arxiv.org/api/query?search_query=all:' + ARXIV_QUERY +
           '&start=0&max_results=1&sortBy=lastUpdatedDate&sortOrder=descending')
    last_timestamp = datetime.timestamp(datetime.now()) - args.days_since * 3600 * 24
    logger.info("Start listening with arguments: " + str(args))

    while True:
        # Fetch the most recently updated paper for the query on every poll.
        data = urllib.request.urlopen(url).read()
        data_obj = atoma.parse_atom_bytes(data)
        next_timestamp = datetime.timestamp(data_obj.updated)
        logger.info("Get a paper from arXiv updated at " + str(data_obj.updated) +
                    ": " + data_obj.entries[0].id_)
        # If next_timestamp > last_timestamp, an update was found, so call the
        # Twitter API to update the status.
        if (next_timestamp - last_timestamp) > 0.0:
            # post a tweet on Twitter
            post_content = (" ".join(HASHTAGS2PREPEND) + ' new arxiv ' + ARXIV_QUERY +
                            ' related paper: ' + data_obj.entries[0].id_ + '\n' +
                            data_obj.entries[0].title.value + ".")
            if len(post_content) > 280:
                # Twitter allows a tweet of up to 280 characters.
                post_content = post_content[:277] + "..."
            # post
            try:
                response = api.update_status(post_content, tweet_mode="extended")
            except TweepError as err:
                # If there is an error, wait REQUEST_INTERVAL seconds before the next
                # call by setting last_timestamp to next_timestamp + 1.
                last_timestamp = next_timestamp + 1
                logger.error(err)
                logger.error("Go to sleep " + str(REQUEST_INTERVAL) + " seconds")
                continue
            if response.full_text is not None:
                logger.info("Successfully posted the tweet:\n===================\n" +
                            response.full_text + "\n===================")
            else:
                logger.error(response)
            last_timestamp = next_timestamp
        else:
            logger.info("Start sleeping given days_since = " +
                        str(args.days_since) + " days")
            time.sleep(REQUEST_INTERVAL)
            logger.info("No update in the last " + str(REQUEST_INTERVAL) + " seconds")
def get_psf_news():
    response = requests.get('http://pyfound.blogspot.com/atom.xml')
    feed = atoma.parse_atom_bytes(response.content)
    t = 0
def query_arxiv(query):
    response = req.get(ARXIV_BASE_URL, params={'search_query': query})
    feed = atoma.parse_atom_bytes(response.content)
    return feed
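# A minimal usage sketch for query_arxiv above, assuming ARXIV_BASE_URL points at the
# arXiv API query endpoint (http://export.arxiv.org/api/query) and `req` aliases requests;
# the search term is an arbitrary example.
feed = query_arxiv('all:electron')
for entry in feed.entries:
    print(entry.title.value)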
#!/usr/bin/python3
import atoma
import requests
import re
import pandas as pd

# Proxy support
proxies = {
    "http": "http://<<proxy>>:80",
    "https": "http://<<proxy>>:80",
}

response = requests.get('https://github.com/security-advisories',
                        proxies=proxies)
feed = atoma.parse_atom_bytes(response.content)

new_items = []
# Collect all vulnerabilities with a CVE
for vuln in feed.entries:
    new_item = {}
    new_item['Id'] = re.findall(r'\[(.*?)\]', vuln.title.value)
    new_item['Published'] = vuln.published.strftime('%Y/%m/%d')
    new_item['Updated'] = vuln.updated.strftime('%Y/%m/%d')
    new_item['Title'] = re.findall(r'\s.*', vuln.title.value)
    new_item['Cve'] = re.findall(r'CVE-\d{4}-\d{4,7}', vuln.content.value)
    new_items.append(new_item)

print(new_items)
df = pd.DataFrame(new_items)
def get_deals(client, conditions, currencies, minimum_discount):
    for condition in args.condition:
        for currency in args.currency:
            wantlist_url = f'{WWW}/sell/mpmywantsrss'
            wantlist_params = {
                'output': 'rss',
                'user': DISCOGS_USER,
                'condition': condition,
                'currency': currency,
                'hours_range': '0-12',
            }
            feed = atoma.parse_atom_bytes(
                get(client, wantlist_url, wantlist_params).encode('utf8')
            )
            for entry in feed.entries:
                try:
                    listing_id = entry.id_.split('/')[-1]
                    listing = call_public_api(
                        client, f'/marketplace/listings/{listing_id}'
                    )
                    if listing['seller']['username'] in BLOCKED_SELLERS:
                        continue
                    seller_rating = get_seller_rating(listing)
                    price = get_total_price(listing)
                    release_year = get_release_year(listing)
                    release_id = listing['release']['id']
                    min_price, median_price, max_price = get_price_statistics(
                        client, release_id
                    )
                    suggested_price = get_suggested_price(
                        client, release_id, condition
                    )
                    demand_ratio = get_demand_ratio(client, release_id)
                    has_sold = True

                    if price is None:
                        continue

                    # adjust price for standard domestic shipping
                    price = price - STANDARD_SHIPPING

                    release_age = date.today().year - release_year
                    if condition == CONDITIONS['VG+']:
                        if release_age < ALLOW_VG['minimum_age']:
                            continue
                        if seller_rating < ALLOW_VG['minimum_seller_rating']:
                            continue

                    if median_price is None:
                        has_sold = False

                    if has_sold:
                        if not price < median_price:
                            continue
                        difference_from_median = difference(price, median_price)
                        difference_from_suggested = difference(price, suggested_price)
                        difference_from_min = difference(price, min_price)
                        difference_from_max = difference(price, max_price)
                        minimum = minimum_discount if demand_ratio < 2 else 5
                        if difference_from_median < minimum:
                            continue

                    debug(
                        f'\n{entry.title.value}\n'
                        f'{entry.summary.value}\n'
                        f'price: ${price:.2f}\n'
                        f'demand ratio: {demand_ratio:.1f}\n'
                        f'seller rating: {seller_rating:.1f}\n'
                        f'release year: {release_year}'
                    )

                    if has_sold:
                        debug(
                            f'median price: ${median_price:.2f}\n'
                            f'suggested price: ${suggested_price:.2f}\n'
                            f'lowest price: ${min_price:.2f}\n'
                            f'highest price: ${max_price:.2f}\n'
                            f'difference from median: {difference_from_median}%\n'
                            f'difference from suggested: {difference_from_suggested}%\n'
                            f'difference from lowest: {difference_from_min}%\n'
                            f'difference from highest: {difference_from_max}%\n'
                        )
                        summary = (
                            f'<b>{summarize_difference(difference_from_median)}'
                            f' median price (${median_price:.2f})</b><br>'
                            f'{summarize_difference(difference_from_suggested)}'
                            f' suggested price (${suggested_price:.2f})<br>'
                            f'{summarize_difference(difference_from_min)}'
                            f' min price (${min_price:.2f})<br>'
                            f'{summarize_difference(difference_from_max)}'
                            f' max price (${max_price:.2f})<br>'
                            f'demand ratio: {demand_ratio:.1f}<br><br>'
                            f'{entry.summary.value}'
                        )
                    else:
                        debug('never sold\n')
                        summary = (
                            f'<b>never sold</b><br>'
                            f'demand ratio: {demand_ratio:.1f}<br><br>'
                            f'{entry.summary.value}'
                        )

                    yield {
                        'id': entry.id_,
                        'title': entry.title.value,
                        'updated': isoformat(entry.updated),
                        'summary': summary,
                    }
                except DealException as e:
                    log_error(e, entry)
                except httpx.HTTPError as e:
                    debug(e)
def scarica_feed() -> list:
    r = requests.get(ATOM_FEED)
    if r.status_code == 200:
        return lista_articoli(parse_atom_bytes(r.content).entries)
    else:
        return []