def _news_scraper(news_site_uid):
    """Scrape the articles linked from a configured news site's home page.

    Reads the site's URL from the configuration, builds the home page object,
    fetches every link it exposes, and prints the title of each article that
    was fetched successfully followed by the number of articles collected.

    Args:
        news_site_uid: Key of the site under ``news_sites`` in the
            configuration.
    """
    host = configuration()['news_sites'][news_site_uid]['url']

    # Log through the module-level ``logger`` (as the rest of this function
    # does) instead of the root ``logging`` module, and let the logging
    # framework do the formatting.
    logger.info('Beginning scraper for %s', host)
    homepage = news.HomePage(news_site_uid, host)

    articles = []
    for link in homepage.article_links:
        article = _fetch_article(news_site_uid, host, link)
        if not article:
            # Falsy result: nothing to record for this link.
            continue

        logger.info('Article fetched!!')
        articles.append(article)
        print(article.title)

    print(len(articles))
#!/usr/bin/python # Frodo - A web app for monitoring SGE cluster status: https://bitbucket.org/yoavram/frodo # Copyright (c) 2012 by Yoav Ram. # This work is licensed under the Creative Commons Attribution-ShareAlike 3.0 Unported License. # To view a copy of this license, visit http://creativecommons.org/licenses/by-sa/3.0/.from flask import Flask, render_template, request, session, redirect, url_for, jsonify from flask import Flask, render_template, request, session, redirect, url_for, jsonify import time import common import qstat JOB_ID_KEY = 'jobID' cfg = common.configuration() app = Flask(__name__) app.debug = cfg.getboolean('web','development') app.secret_key = cfg.get('web','secret') @app.route('/') def index(): return redirect(url_for('qstat_html')) @app.route('/qstat') @app.route('/qstat/jobID/<int:jobID>') @app.route('/qstat/username/<qusername>') def qstat_html(jobID = None, qusername=None): if 'username' not in session: return redirect(url_for('login')) username = session['username'] password = session['password'] now = time.asctime()
#!/usr/bin/python # Frodo - A web app for monitoring SGE cluster status: https://bitbucket.org/yoavram/frodo # Copyright (c) 2012 by Yoav Ram. # This work is licensed under the Creative Commons Attribution-ShareAlike 3.0 Unported License. # To view a copy of this license, visit http://creativecommons.org/licenses/by-sa/3.0/.from flask import Flask, render_template, request, session, redirect, url_for, jsonify from flask import Flask, render_template, request, session, redirect, url_for, jsonify import time import common import qstat JOB_ID_KEY = "jobID" cfg = common.configuration() app = Flask(__name__) app.debug = cfg.getboolean("web", "development") app.secret_key = cfg.get("web", "secret") @app.route("/") def index(): return redirect(url_for("qstat_html")) @app.route("/qstat") @app.route("/qstat/jobID/<int:jobID>") @app.route("/qstat/username/<qusername>") def qstat_html(jobID=None, qusername=None): if "username" not in session: return redirect(url_for("login")) username = session["username"]
# NOTE(review): the statements down to ``return article`` are the tail of a
# function whose ``def`` and ``try:`` lie outside the visible excerpt.
    except (HTTPError, MaxRetryError) as e:
        # The failure is logged without a traceback (exc_info=False) and is
        # not re-raised here.
        logger.warning('Error while fetching the article', exc_info=False)

    # A truthy article whose ``body`` is falsy is reported and dropped.
    if article and not article.body:
        logger.warning('Invalid article. There is no body')
        return None

    return article


def _build_link(host, link):
    """Return the URL to request for ``link`` as found on ``host``.

    The link is returned unchanged when ``is_well_formed_url`` matches it,
    appended directly to ``host`` when ``is_root_path`` matches it, and
    otherwise joined to ``host`` with a ``/`` separator.

    NOTE(review): both patterns are defined outside this excerpt; presumably
    they recognise absolute URLs and paths starting with ``/`` -- confirm.
    """
    if is_well_formed_url.match(link):
        return link
    elif is_root_path.match(link):
        return f'{host}{link}'
    else:
        return f'{host}/{link}'


if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    # Only site keys present in the configuration are accepted on the
    # command line.
    news_site_choices = list(configuration()['news_sites'].keys())
    parser.add_argument('news_site',
                        help='The news site you that want to scrape',
                        type=str,
                        choices=news_site_choices)

    args = parser.parse_args()
    _news_scraper(args.news_site)
def __init__(self, news_site_uid, url):
    """Load the settings for one news site and visit ``url``.

    Stores the site's configuration entry and its ``queries`` mapping on the
    instance, resets the cached HTML to ``None`` and then calls
    ``self._visit(url)``.

    Args:
        news_site_uid: Key of the site under ``news_sites`` in the
            configuration.
        url: Address handed to ``_visit``.
    """
    site_settings = configuration()['news_sites'][news_site_uid]

    self._config = site_settings
    self._queries = site_settings['queries']

    # Reset before visiting; presumably ``_visit`` fills this in -- its
    # definition is outside this excerpt.
    self._html = None
    self._visit(url)