def get_question_feed(url, force_reload=False):
    """Retrieve the last questions of the feed

    Returns a structure with the following format:
    [Question_1, Question_2, ...]
    where Question_n has the following keys:
        link: str
        title: str
        body: str (plain text extracted from the summary HTML)
        tags: list of str
    """
    log(bold("Fetching question feed"))
    if force_reload:
        log(fg("Force reload", magenta))
    feed = spider.get_feed(url, force_reload=force_reload)
    if feed.status == 304:  # Not Modified
        log(fg("Feed not modified since last retrieval (status 304)", magenta))
        return []
    log("Number of entries in feed: {}", fg(len(feed.entries), green))
    questions = []
    for entry in feed.entries:
        soup = BeautifulSoup(entry.summary, "html.parser")
        q = {
            "link": entry.link,
            "title": entry.title,
            "body": soup.getText(" ", strip=True),
            "tags": [x["term"] for x in entry.tags],
        }
        questions.append(q)
    return questions

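# Hedged usage sketch (not part of the original module): shows how a caller
# might consume get_question_feed. The feed URL follows the pattern built in
# the recommend command; the "python" tag and the helper name are placeholders.
def _example_print_latest_questions():
    url = "https://stackoverflow.com/feeds/tag?tagnames=python&sort=newest"
    for q in get_question_feed(url):
        # Each entry is a plain dict with link, title, body and tags
        print(q["title"])
        print("  ", q["link"])
        print("  tags:", ", ".join(q["tags"]))
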
def warn(msg, *argv):
    """Print a warning message without aborting execution"""
    err_off = sys.stderr not in _logs
    if err_off:
        add_stderr()
    log(fg(msg, magenta), *argv, who=_get_caller())
    if err_off:
        # Remove stderr again only if it was added by this call
        _logs.pop()

def abort(msg, *argv):
    """Print an error message and abort execution"""
    if sys.stderr not in _logs:
        add_stderr()
    log(fg(msg, red), *argv, who=_get_caller())
    print_advice()
    close_logs()
    exit()

def get(url, delay=2, use_cache=True, max_delta=td(hours=12)):
    """Respectful wrapper around requests.get"""
    useragent = "Answerable v0.1"
    # If a cached response exists and is recent enough, return it
    cache_file = url.replace("/", "-")
    if use_cache:
        log("Checking cache before request {}", fg(url, yellow))
        hit, path = cache.check("spider", cache_file, max_delta)
        if hit:
            with open(path, "r") as fh:
                res = fh.read().replace("\\r\\n", "")
            return _FalseResponse(200, res)
    # If the robots.txt doesn't allow the scraping, return forbidden status
    if not ask_robots(url, useragent):
        log(fg("robots.txt forbids {}", red), url)
        return _FalseResponse(403, "robots.txt forbids it")
    # Make the request after the specified delay
    # log("[{}] {}".format(fg("{:4.2f}".format(delay), yellow), url))
    log("Waiting to ask for {}", fg(url, yellow))
    log(" in {:4.2f} seconds", delay)
    sleep(delay)
    headers = {"User-Agent": useragent}
    log("Requesting")
    res = requests.get(url, timeout=10, headers=headers)
    # Exit the program if the scraping was penalized
    if res.status_code == 429:  # too many requests
        abort("Too many requests")
    # Cache the response if allowed by the user
    if use_cache:
        cache.update(
            "spider", cache_file, res.content.decode(res.encoding), json_format=False
        )
    return res

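# Hedged usage sketch: a respectful GET that reuses a response cached within
# the last hour instead of hitting the network again. The URL is a placeholder,
# and the sketch assumes _FalseResponse mimics the status_code attribute of
# requests.Response, which is not shown in this listing.
def _example_respectful_get():
    url = "https://stackoverflow.com/users/0000000"  # placeholder URL
    res = get(url, delay=2, use_cache=True, max_delta=td(hours=1))
    if res.status_code == 403:
        print("robots.txt forbids scraping this URL")
    elif res.status_code == 200:
        print("Response obtained (possibly served from cache)")
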
def update(category: str, _file: str, obj, json_format=True):
    """Update or create a file in the cache

    Parameters:
        category: Folder inside the cache.
        _file: File name to store in.
        obj: Serializable object to store.
        json_format: If True, serialize obj as JSON; otherwise write it as plain text.
    """
    subpath = pathlib.Path(category) / _file
    path = pathlib.Path.cwd() / __cache_dir / subpath
    path.parent.mkdir(parents=True, exist_ok=True)
    try:
        with open(path, "w") as fh:
            if json_format:
                json.dump(obj, fh, indent=2)
            else:
                fh.write(obj)
        log(" Cache updated: {}", fg(subpath, green))
    except OSError as err:
        log(" {}: {}", err, fg(subpath, magenta))
        return False, path

def check(category: str, _file: str, max_delta: td) -> (bool, pathlib.Path):
    """Return whether a file is cached and where it is located.

    Returns:
        (B, P) where
        - B is true if the content is cached and usable
        - P is the path where the cached content is/should be.

    Parameters:
        category: Folder inside the cache.
        _file: File name to look for.
        max_delta: Timedelta used as threshold to consider a file too old.
    """
    # Prepare the path to the cached file
    subpath = pathlib.Path(category) / _file
    path = pathlib.Path.cwd() / __cache_dir / subpath
    path.parent.mkdir(parents=True, exist_ok=True)
    try:
        if not path.exists():
            log(" Miss {}", fg(subpath, magenta))
            return False, path
        else:
            # Check if the file is too old
            log(" Hit {}", fg(subpath, green))
            modified = dt.fromtimestamp(path.stat().st_mtime)
            now = dt.now()
            delta = now - modified
            log(" Time passed since last fetch: {}", delta)
            valid = delta < max_delta
            if valid:
                log(fg(" Recent enough", green))
            else:
                log(fg(" Too old", magenta))
            return valid, path
    except OSError as err:
        log(" {}: {}", err, fg(subpath, magenta))
        return False, path

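# Hedged sketch of the intended check/update pattern (the "examples" category,
# file name and computed value are made up for illustration): look for a cached
# copy first, and only recompute and re-store it when check() reports a miss or
# a stale file.
def _example_cached_value():
    hit, path = check("examples", "squares.json", td(hours=6))
    if hit:
        with open(path) as fh:
            return json.load(fh)
    value = [n * n for n in range(10)]  # an expensive computation stands here
    update("examples", "squares.json", value)
    return value
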
def get_QA(user_id, force_reload=False, max_page=5):
    """Retrieve information about the questions answered by the user

    Return
    [
        (Question_1, Answer_1),
        (Question_2, Answer_2),
        ...
    ]
    See get_questions, get_user_answers
    """
    log(bold("Fetching user information"))
    if force_reload:
        log(fg("Force reload", magenta))
    cache_file = str(user_id) + ".json"
    # Check cache
    if not force_reload:
        hit, fpath = cache.check(cache_where, cache_file, cache_threshold)
        if hit:
            with open(fpath) as fh:
                stored = json.load(fh)
            return stored
    # Get the answers
    answers = get_user_answers(user_id, force_reload, max_page)
    # Get the questions
    q_ids = [str(a["question_id"]) for a in answers]
    questions = get_questions(q_ids)
    # Join answers and questions
    user_qa = [
        (q, a)
        for q in questions
        for a in answers
        if q["question_id"] == a["question_id"]
    ]
    # Copy the question tags into each answer before caching, so cached and
    # freshly built results have the same shape
    for q, a in user_qa:
        a["tags"] = q["tags"]
    cache.update(cache_where, cache_file, user_qa)
    # Include questions specified by the user
    try:
        with open("include.txt", "r") as f:
            extra_q_ids = f.read().split()
        log("Additional training: " + str(extra_q_ids))
        extra_questions = get_questions(extra_q_ids)
    except FileNotFoundError:
        extra_questions = []
        log("No additional training specified by user")
    user_qa += [(q, None) for q in extra_questions]
    return user_qa

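# Hedged usage sketch of how a caller such as the recommend command consumes
# the result. The user id is a placeholder; only question_id is relied on here
# because it is the one question key this listing guarantees.
def _example_inspect_user_qa(user_id=1234567):
    user_qa = get_QA(user_id)
    for question, answer in user_qa:
        # Extra questions from include.txt are paired with None instead of an answer
        origin = "included" if answer is None else "answered"
        print(origin, question["question_id"])
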
def get_feed(url, force_reload=False):
    """Get RSS feed and optionally remember it to reduce bandwidth"""
    useragent = "Answerable RSS v0.1"
    log("Requesting feed {}", fg(url, yellow))
    cache_file = url.replace("/", "_")
    # Get the conditions for the conditional GET (bandwidth reduction)
    etag = None
    modified = None
    if not force_reload:
        hit, path = cache.check("spider.rss", cache_file, td(days=999))
        if hit:
            with open(path, "r") as fh:
                headers = json.load(fh)
            etag = headers["etag"]
            modified = headers["modified"]
            log("with {}: {}", bold("etag"), fg(etag, yellow))
            log("with {}: {}", bold("modified"), fg(modified, yellow))
    # Get the feed
    feed = feedparser.parse(url, agent=useragent, etag=etag, modified=modified)
    # Store the etag and/or modified headers
    if feed.status != 304:
        etag = feed.etag if "etag" in feed else None
        modified = feed.modified if "modified" in feed else None
        new_headers = {
            "etag": etag,
            "modified": modified,
        }
        cache.update("spider.rss", cache_file, new_headers)
        log("Stored new {}: {}", bold("etag"), fg(etag, green))
        log("Stored new {}: {}", bold("modified"), fg(modified, green))
    return feed

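# Hedged sketch of the conditional-GET behaviour this function relies on
# (feedparser's etag/modified support): a second call shortly after the first
# should come back with status 304 and no entries, because the stored headers
# tell the server what we already have. The helper name is illustrative only.
def _example_conditional_get(url):
    first = get_feed(url, force_reload=True)  # full download, headers cached
    second = get_feed(url)                    # reuses stored etag/modified
    if second.status == 304:
        print("Feed unchanged; nothing re-downloaded")
    else:
        print("Feed changed:", len(second.entries), "entries")
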
def cf(x):
    """Color a count green when it is zero, magenta otherwise"""
    return (
        displayer.fg(x, displayer.green)
        if x == 0
        else displayer.fg(x, displayer.magenta)
    )

def recommend(args):
    """Recommend questions from the latest unanswered"""
    filtered = {"hidden": 0, "closed": 0, "duplicate": 0}

    def valid_entry(entry):
        """Check if an entry should be taken into account"""
        if len(set(entry["tags"]) & hide_tags) > 0:
            filtered["hidden"] += 1
            return False
        if entry["title"][-8:] == "[closed]":
            filtered["closed"] += 1
            return False
        if entry["title"][-11:] == "[duplicate]":
            filtered["duplicate"] += 1
            return False
        return True

    def cf(x):
        """Color a count green when it is zero, magenta otherwise"""
        return (
            displayer.fg(x, displayer.green)
            if x == 0
            else displayer.fg(x, displayer.magenta)
        )

    # Load configuration
    config = load_config(args)
    # Load the model
    try:
        model_name = config["model"]
        log.log("Loading model {}", displayer.fg(model_name, displayer.yellow))
        model = importlib.import_module(f".{model_name}", "models")
        log.log(
            "Model {} successfully loaded",
            displayer.fg(model_name, displayer.green),
        )
    except ModuleNotFoundError as err:
        if err.name == f"models.{model_name}":
            log.abort("Model {} not present", model_name)
        else:
            log.abort("Model {} unsatisfied dependency: {}", model_name, err.name)
    # Get user info and feed
    user_qa = fetcher.get_QA(config["user"], force_reload=args.f)
    if args.all or "tags" not in config:
        tags = ""
    else:
        tags = "tag?tagnames="
        tags += "%20or%20".join(config["tags"]["followed"]).replace("+", "%2b")
        tags += "&sort=newest"
    url = "https://stackoverflow.com/feeds/" + tags
    try:
        feed = fetcher.get_question_feed(url, force_reload=args.F)
        if len(feed) == 0:
            raise ValueError("No feed returned")
        # Filter out feed entries with ignored tags
        hide_tags = (
            set()
            if args.all or "tags" not in config
            else set(config["tags"]["ignored"])
        )
        useful_feed = [e for e in feed if valid_entry(e)]
        if len(useful_feed) == 0:
            raise ValueError("All feed filtered out")
        log.log(
            "Discarded: {} ignored | {} closed | {} duplicate",
            cf(filtered["hidden"]),
            cf(filtered["closed"]),
            cf(filtered["duplicate"]),
        )
        # Make the recommendation
        log.log(f"Corpus size: {len(user_qa)} Feed size: {len(useful_feed)}")
        rec_index, info = model.recommend(user_qa, useful_feed)
        selection = [useful_feed[i] for i in rec_index[:args.limit]]
        if args.info and info is None:
            log.warn("Info requested, but model {} returns None", model_name)
        elif args.info and info is not None:
            info = [info[i] for i in rec_index[:args.limit]]
        displayer.disp_feed(selection, info, args.info)
    except ValueError as err:
        log.warn(err)
        log.print_advice()

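# Hedged sketch of the model interface that recommend() assumes: a module under
# models/ (the file name below is hypothetical) exposing recommend(user_qa, feed)
# and returning (rec_index, info), where rec_index orders the feed entries by
# relevance and info is either None or one displayable item per feed entry.
# The tag-overlap scoring is made up purely to illustrate the contract; it is
# not the project's actual model.
#
# --- contents of a hypothetical models/tag_overlap.py ---
def recommend(user_qa, feed):
    # Collect the tags of every question the user has interacted with
    known_tags = set()
    for question, answer in user_qa:
        known_tags.update(question.get("tags", []))
    # Score each feed entry by how many of its tags the user already knows
    scores = [len(known_tags & set(entry["tags"])) for entry in feed]
    # Indices of the feed, best-scoring entries first
    rec_index = sorted(range(len(feed)), key=lambda i: scores[i], reverse=True)
    # One info string per feed entry, indexed like the feed itself
    info = [f"shared tags: {scores[i]}" for i in range(len(feed))]
    return rec_index, info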