import json
import os
from datetime import datetime

import requests

# Project-local names used below (fu, tu, ArticleParser, DATA_PATH,
# PRODUCER, REGISTRY) are assumed to be defined or imported elsewhere
# in this package.


def parts(feed):
    """Returns a (feed_name, region, topic) tuple of feed parts."""
    # url_to_file_name can also be used on strings
    feed_name = fu.baseurl_to_file_name(feed[1])
    region = fu.url_to_file_name(feed[2])
    topic = fu.url_to_file_name(feed[3])
    return (feed_name, region, topic)
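# `delivery_report` is passed as the produce() callback below but is not
# defined in this section. A minimal sketch, assuming PRODUCER is a
# confluent-kafka Producer, whose delivery callbacks receive (err, msg):
def delivery_report(err, msg):
    """Log the outcome of an asynchronous produce() call."""
    if err is not None:
        print('Message delivery failed: {}'.format(err))
    else:
        print('Message delivered to {} [{}]'.format(
            msg.topic(), msg.partition()))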
def _parse_article(article_path, article):
    data_path = os.path.join(fu.get_mount_folder(), fu.FEED_BASE)
    output_dir = os.path.join(
        data_path,
        article['region'],
        article['topic'],
        tu.ts_to_month(datetime.now().isoformat()),
        article['name'],
    )
    os.makedirs(output_dir, exist_ok=True)
    output_file = fu.url_to_file_name(article['url']) + fu.PARSED_ARTICLE
    output_path = os.path.join(output_dir, output_file)
    parser = ArticleParser()
    title, _, authors, publish_date = parser.parse(
        article_path,
        output_path,
        article['url'],
    )
    return {
        'title': title,
        'authors': authors,
        # Fall back to the current time when the parser finds no date
        'publish_date': publish_date or datetime.now().isoformat(),
        'path': output_path,
        'article': article,
    }
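# The `article` dict consumed here mirrors the JSON payload that
# process_feed (below) publishes to the 'raw_rss' topic; e.g., with
# hypothetical values:
#
#     {
#         'name': 'example_com',
#         'path': '/data/2024/06/01/12/example_com.raw',
#         'region': 'europe',
#         'topic': 'politics',
#         'url': 'https://example.com/feeds/politics.rss',
#     }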
def process_feed(feed):
    feed_name = '{}_{}_{}'.format(
        feed.region,
        feed.topic,
        fu.url_to_file_name(feed.url),
    )
    # Prometheus metric names cannot contain spaces
    feed_name = feed_name.replace(' ', '_')
    summary_name = '{}_feed_processing_seconds'.format(feed_name)
    process_feed_time = REGISTRY.get_summary(
        summary_name,
        'Time spent processing feed',
    )
    errors_name = '{}_feed_processing_exceptions'.format(feed_name)
    count_errors = REGISTRY.get_counter(
        errors_name,
        'Exceptions processing feed',
    )

    @process_feed_time.time()
    def _process(feed):
        hour = tu.ts_to_hour(datetime.now().isoformat())
        output_dir = os.path.join(
            DATA_PATH,
            fu.datestamp_to_path(hour),
        )
        os.makedirs(output_dir, exist_ok=True)
        output_file = fu.url_to_file_name(feed.url) + fu.RAW_FEED
        output_path = os.path.join(output_dir, output_file)
        # Fetch before opening the file so a failed request does not
        # leave an empty raw feed on disk
        r = requests.get(feed.url, timeout=15)
        r.raise_for_status()
        with open(output_path, 'w') as out:
            out.write(r.text)
        output = {
            'name': fu.baseurl_to_file_name(feed.url),
            'path': output_path,
            'region': feed.region,
            'topic': feed.topic,
            'url': feed.url,
        }
        # Serve delivery callbacks from earlier produce() calls, then
        # publish the feed metadata to the raw_rss topic
        PRODUCER.poll(0)
        PRODUCER.produce(
            'raw_rss',
            json.dumps(output).encode('utf-8'),
            callback=delivery_report,
        )

    try:
        _process(feed)
        return 0
    except Exception as e:
        print(repr(e))
        count_errors.inc()
        return 1
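# A minimal driver sketch (`load_feeds` is a hypothetical helper, not part
# of this module): process every configured feed, then flush the producer
# so queued messages are delivered before exit.
#
#     def main():
#         failures = sum(process_feed(feed) for feed in load_feeds())
#         PRODUCER.flush()
#         print('{} feed(s) failed to process'.format(failures))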
def _get_article(article):
    data_path = os.path.join(fu.get_mount_folder(), fu.FEED_BASE)
    output_dir = os.path.join(
        data_path,
        article['region'],
        article['topic'],
        tu.ts_to_month(datetime.now().isoformat()),
        article['name'],
    )
    os.makedirs(output_dir, exist_ok=True)
    output_file = fu.url_to_file_name(article['url']) + fu.RAW_ARTICLE
    output_path = os.path.join(output_dir, output_file)
    # Short circuit: the article was already fetched on a previous run
    if os.path.exists(output_path):
        return None
    # Fetch before opening the file; writing first would leave an empty
    # file on request failure, which the short circuit above would then
    # mistake for a fetched article
    res = requests.get(article['url'], timeout=15)
    res.raise_for_status()
    with open(output_path, 'w') as out:
        out.write(res.text)
    return output_path
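# How the two article helpers compose (a sketch; `handle_article` is a
# hypothetical name): fetch the raw article, then parse it. _get_article
# returns None for articles fetched on an earlier run, so those are skipped.
#
#     def handle_article(article):
#         raw_path = _get_article(article)
#         if raw_path is None:
#             return None  # already fetched
#         return _parse_article(raw_path, article)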