def test_whitelisted_user():
    user_url = 'http://stackoverflow.com/users/2/geoff-dalgas'
    user = get_user_from_url(user_url)
    add_whitelisted_user(user)
    user_url2 = 'http://stackoverflow.com/users/0/test'
    user2 = get_user_from_url(user_url2)
    add_whitelisted_user(user2)

    post = Post(api_response={'title': '', 'body': '',
                              'owner': {'display_name': 'bagprada', 'reputation': 1, 'link': user_url},
                              'site': 'stackoverflow.com', 'question_id': '1', 'IsAnswer': False, 'score': 0})
    is_spam, reason, _ = check_if_spam(post)
    assert is_spam is False

    post = Post(api_response={'title': 'baba ji', 'body': '',
                              'owner': {'display_name': '', 'reputation': 1, 'link': user_url},
                              'site': 'stackoverflow.com', 'question_id': '2', 'IsAnswer': False, 'score': 0})
    is_spam, reason, _ = check_if_spam(post)
    assert is_spam is True

    post = Post(api_response={'title': 'baba ji', 'body': '',
                              'owner': {'display_name': 'bagprada', 'reputation': 1, 'link': user_url},
                              'site': 'stackoverflow.com', 'question_id': '3', 'IsAnswer': False, 'score': 0})
    is_spam, reason, _ = check_if_spam(post)
    assert is_spam is True

    post = Post(api_response={'title': 'test', 'body': '',
                              'owner': {'display_name': 'baba ji - muscle building', 'reputation': 1,
                                        'link': user_url2},
                              'site': 'stackoverflow.com', 'question_id': '0', 'IsAnswer': False, 'score': 0})
    is_spam, reason, _ = check_if_spam(post)
    assert is_spam is False

    # cleanup
    _remove_pickle("whitelistedUsers.p")
def check_blacklist(string_to_test, is_username, is_watchlist):
    # Test the string and provide a warning message if it is already caught.
    if is_username:
        question = Post(api_response={'title': 'Valid title', 'body': 'Valid body',
                                      'owner': {'display_name': string_to_test, 'reputation': 1, 'link': ''},
                                      'site': "", 'IsAnswer': False, 'score': 0})
        answer = Post(api_response={'title': 'Valid title', 'body': 'Valid body',
                                    'owner': {'display_name': string_to_test, 'reputation': 1, 'link': ''},
                                    'site': "", 'IsAnswer': True, 'score': 0})
    else:
        question = Post(api_response={'title': 'Valid title', 'body': string_to_test,
                                      'owner': {'display_name': "Valid username", 'reputation': 1, 'link': ''},
                                      'site': "", 'IsAnswer': False, 'score': 0})
        answer = Post(api_response={'title': 'Valid title', 'body': string_to_test,
                                    'owner': {'display_name': "Valid username", 'reputation': 1, 'link': ''},
                                    'site': "", 'IsAnswer': True, 'score': 0})

    question_reasons, _ = FindSpam.test_post(question)
    answer_reasons, _ = FindSpam.test_post(answer)

    # Filter out duplicates
    reasons = list(set(question_reasons) | set(answer_reasons))

    # Filter out watchlist results
    if not is_watchlist:
        reasons = list(filter(lambda reason: "potentially bad keyword" not in reason, reasons))

    return reasons
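# Usage sketch (not from the source): how check_blacklist above might be driven.
# The test strings below are invented placeholders.
def demo_check_blacklist():
    # Would this body text already be caught by an existing rule?
    body_hits = check_blacklist("some suspicious phrase", is_username=False, is_watchlist=False)
    if body_hits:
        print("Already caught by:", ", ".join(body_hits))

    # For usernames the string is placed in owner.display_name instead;
    # is_watchlist=True keeps "potentially bad keyword" matches in the result.
    name_hits = check_blacklist("Suspicious Name", is_username=True, is_watchlist=True)
    return name_hits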
def updater(bot, domain, last_id):
    log.info('[VK] Checking for new posts in {0} with last ID {1}'.format(domain, last_id))
    posts = get_data(domain)
    for post in reversed(posts):
        if post['id'] > last_id:
            log.info("[VK] Found a new post with ID {0}".format(post['id']))
            new_post = Post(post, domain)
            new_post.generate_post()
            send_post(bot, domain, new_post)
            last_id = update_parameter(domain, 'last_id', post['id'])
            time.sleep(5)
        if post['id'] == last_id:
            log.info('[VK] No more new posts found')
    log.info('[VK] Check finished, last_id = {0}.'.format(last_id))
def delete_post(postid):
    with dbapi2._connect(current_app.config['dsn']) as connection:
        cursor = connection.cursor()
        query = """SELECT u.ID, m.MOVIEID, COMMENTS
                   FROM USERS u
                   INNER JOIN POSTS p ON (u.ID = p.USER_ID)
                   INNER JOIN MOVIES m ON (m.MOVIEID = p.MOVIE_ID)
                   WHERE (p.POST_ID = %s)"""
        cursor.execute(query, (postid, ))
        post = cursor.fetchone()
        post_to_delete = Post(post[0], post[1], post[2])
        post_to_delete.delete_post_from_db()
        connection.commit()
    return redirect(url_for('page.profile_page'))
def test_regexes(title, body, username, site, body_is_summary, is_answer, match):
    # If we want to test answers separately, this should be changed
    # is_answer = False
    post = Post(api_response={'title': title, 'body': body,
                              'owner': {'display_name': username, 'reputation': 1, 'link': ''},
                              'site': site, 'question_id': '1', 'IsAnswer': is_answer,
                              'BodyIsSummary': body_is_summary, 'score': 0})
    result = FindSpam.test_post(post)[0]

    log('info', title)
    log('info', "Result:", result)

    isspam = len(result) > 0
    if match != isspam:
        print((body, match))
    assert match == isspam
def mock_post(cls, title='', body='', site='stackoverflow.com',
              link='https://stackoverflow.com/q/1732454',
              owner={'link': 'https://stackoverflow.com/users/102937/robert-harvey'},
              post_id=1732454, is_question=True, increment_auto_post_id=True):
    if increment_auto_post_id:
        cls.auto_post_id += 1
        link = 'https://stackoverflow.com/{}/{}'.format('q' if is_question else 'a', cls.auto_post_id)
        post_id = cls.auto_post_id

    api_response = {
        "title": title,
        "body": body,
        "site": site,
        "link": link,
        "owner": owner
    }
    if is_question:
        api_response['question_id'] = post_id
    else:
        api_response['answer_id'] = post_id

    return Post(api_response=api_response)
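# Usage sketch (not from the source): mock_post above reads like a classmethod on a
# test-helper class that carries the auto_post_id counter. The PostMocker name and
# the wiring below are hypothetical illustration only.
class PostMocker:
    auto_post_id = 0

PostMocker.mock_post = classmethod(mock_post)

# Each call bumps auto_post_id, so the two mocks get distinct /q/ and /a/ links.
question = PostMocker.mock_post(title='Some title', body='Some body')
answer = PostMocker.mock_post(body='Some body', is_question=False)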
def new_post(self, user_id=None, date_post=None, status=None,
             posting=None, type=None, visibility=None):
    user_post = UserPost()
    user_post.id_user = user_id
    user_post.date_post = date_post
    user_post.status = status

    post = Post()
    post.post = posting
    post.type = type
    post.visibility = visibility

    user_post.post = post

    self.session.add(user_post)
    self.session.commit()
    return user_post
def test(content, alias_used="test"):
    """
    Test a string as a post, title or username to determine if it'd be automatically reported
    :param content: the text to scan
    :param alias_used: which !!/test alias invoked this ("test-q", "test-a", "test-u", "test-t" or "test")
    :return: A string
    """
    result = "> "
    if alias_used == "test-q":
        kind = " question."
        fakepost = Post(api_response={'title': 'Valid title', 'body': content,
                                      'owner': {'display_name': "Valid username", 'reputation': 1, 'link': ''},
                                      'site': "", 'IsAnswer': False, 'score': 0})
    elif alias_used == "test-a":
        kind = "n answer."
        fakepost = Post(api_response={'title': 'Valid title', 'body': content,
                                      'owner': {'display_name': "Valid username", 'reputation': 1, 'link': ''},
                                      'site': "", 'IsAnswer': True, 'score': 0})
    elif alias_used == "test-u":
        kind = " username."
        fakepost = Post(api_response={'title': 'Valid title', 'body': "Valid question body",
                                      'owner': {'display_name': content, 'reputation': 1, 'link': ''},
                                      'site': "", 'IsAnswer': False, 'score': 0})
    elif alias_used == "test-t":
        kind = " title."
        fakepost = Post(api_response={'title': content, 'body': "Valid question body",
                                      'owner': {'display_name': "Valid username", 'reputation': 1, 'link': ''},
                                      'site': "", 'IsAnswer': False, 'score': 0})
    else:
        kind = " post, title or username."
        fakepost = Post(api_response={'title': content, 'body': content,
                                      'owner': {'display_name': content, 'reputation': 1, 'link': ''},
                                      'site': "", 'IsAnswer': False, 'score': 0})

    reasons, why_response = FindSpam.test_post(fakepost)

    if len(reasons) == 0:
        result += "Would not be caught as a{}".format(kind)
    else:
        result += ", ".join(reasons).capitalize()
        if why_response is not None and len(why_response) > 0:
            result += "\n----------\n"
            result += why_response

    return result
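# Usage sketch (not from the source): alias_used selects what the content is treated
# as; the sample strings are invented.
print(test("some suspicious answer body", alias_used="test-a"))  # scan as an answer body
print(test("Some plain title", alias_used="test-t"))             # scan as a title only
# Each call returns either "> Would not be caught as a..." or the matched reasons.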
def admFeed():
    usuarioAtivo = current_user.get_id()
    db = utilitariosDB.getDb()
    sideBar = Markup(getSideBar())
    usuario = db['usuarios'].find_one({
        "_id": ObjectId(usuarioAtivo),
        "administrador": "S"
    })
    if usuario is None:
        return "unauthorized user"
    navbar = Markup(getNavBar(usuario))

    if request.method == 'POST':
        post = Post()
        post.titulo = request.form.get("feedInTitulo", '')
        post.descricao = request.form.get("feedInMensagem", '')
        post.momento = datetime.now()

        # Save to the database
        db = utilitariosDB.getDb()
        post.salvarMongoDb(db)

    return render_template("admFeed.html", sideBarWS=sideBar, navbarWS=navbar)
def get_user_posts(username: str, next_max_id: str = None) -> dict:
    """
    Get a list of posts
    :param username: the user
    :param next_max_id: used for requesting the next page of posts
    :return: {next_max_id, List[Posts]}
    """
    url = "user_posts"
    querystring = {"username": username}
    if next_max_id is not None:
        querystring["next_max_id"] = next_max_id

    response = _request(url, json.dumps(querystring))
    if response["statusCode"] == 404:
        # The user wasn't found; pass the error status through instead of parsing a missing body.
        return response["status"]

    posts_json = response["body"]["items"]
    posts = []
    for post_json in posts_json:
        try:
            source = post_json["video_versions"][0]["url"]
            views = post_json["view_count"]
            is_video = True
        except KeyError:
            source = post_json["image_versions2"]["candidates"][0]["url"]
            views = None
            is_video = False
        caption = post_json["caption"]["text"] if post_json["caption"] is not None else ""
        taken_at = datetime.fromtimestamp(post_json["taken_at"])
        new_post = Post(post_json["code"], source, post_json["like_count"],
                        post_json["comment_count"], caption, is_video, views, taken_at)
        posts.append(new_post)

    to_return = {"post_list": posts}
    if "next_max_id" in response["body"]:
        to_return["next_max_id"] = response["body"]["next_max_id"]
    return to_return
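# Usage sketch (not from the source): paging through a feed with the next_max_id
# cursor this function returns. Assumes the happy path (no 404); the username and
# page bound are placeholders.
def fetch_recent_posts(username: str, max_pages: int = 3) -> list:
    cursor = None
    collected = []
    for _ in range(max_pages):
        page = get_user_posts(username, next_max_id=cursor)
        collected.extend(page["post_list"])
        cursor = page.get("next_max_id")
        if cursor is None:  # no further pages advertised by the API
            break
    return collected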
def test_post_parse_errors():
    from classes import Post, PostParseError
    failure = None
    try:
        failure = Post()
        assert False, 'Post with no initializer did not fail'
    except PostParseError:
        pass
    assert failure is None
def check_if_spam_json(json_data):
    try:
        post = Post(json_data=json_data)
    except PostParseError as err:
        log('error', 'Parse error {0} when parsing json_data {1!r}'.format(err, json_data))
        return False, '', ''
    is_spam, reason, why = check_if_spam(post)
    return is_spam, reason, why
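# Reference sketch (not from the source): the json_data this wrapper accepts is
# shaped like the payload built in test_check_if_spam below; every field value here
# is a fabricated placeholder.
import json

inner = {
    "titleEncodedFancy": "Example title",
    "bodySummary": "Example body",
    "ownerDisplayName": "Example user",
    "url": "//stackoverflow.com/q/1",
    "id": "1",
    "siteBaseHostAddress": "stackoverflow.com",
    "ownerUrl": "//stackoverflow.com/users/1"
}
payload = json.dumps({"action": "155-questions-active",
                      "data": json.dumps(inner),
                      "IsAnswer": False})
is_spam, reason, why = check_if_spam_json(payload)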
def feedInsert():
    if request.method == 'POST':
        post = Post()
        post.titulo = request.form.get("feedInTitulo", '')
        post.descricao = request.form.get("feedInMensagem", '')
        post.midiaTipo = "imgB64"
        post.midia = request.form.get("fileBase64", '')
        post.momento = datetime.now()

        # Save to the database
        db = utilitariosDB.getDb()
        post.salvarMongoDb(db)
        return redirect("/feed")
    else:
        return render_template("feedInsert.html")
def test_blacklisted_user():
    user_url = 'http://stackoverflow.com/users/1/jeff-atwood'
    user = get_user_from_url(user_url)
    add_blacklisted_user(user, "", "")

    # Construct a "fake" post object in API-format
    post = Post(api_response={'title': '', 'body': '',
                              'owner': {'display_name': user, 'reputation': 1, 'link': user_url},
                              'site': 'stackoverflow.com', 'question_id': '1', 'IsAnswer': False, 'score': 0})
    is_spam, reason, _ = check_if_spam(post)
    assert is_spam is True

    # cleanup
    _remove_pickle("blacklistedUsers.p")
def test_findspam(title, body, username, site, body_is_summary, is_answer, expected_spam):
    post = Post(api_response={'title': title, 'body': body,
                              'owner': {'display_name': username, 'reputation': 1, 'link': ''},
                              'site': site, 'question_id': '1', 'IsAnswer': is_answer,
                              'BodyIsSummary': body_is_summary, 'score': 0})
    result = FindSpam.test_post(post)[0]

    log('info', title)
    log('info', "Result:", result)

    scan_spam = (len(result) > 0)
    if scan_spam != expected_spam:
        print("Expected {1} on {0}".format(body, expected_spam))
    assert scan_spam == expected_spam
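# Usage sketch (not from the source): test_regexes and test_findspam above take their
# fixtures as parameters, which pytest can supply via parametrize. Both rows below are
# invented and the expected flags are illustrative guesses, not verified against the
# actual rule set.
import pytest

@pytest.mark.parametrize("title, body, username, site, body_is_summary, is_answer, expected_spam", [
    ("Valid title", "Valid body", "Valid user", "stackoverflow.com", False, False, False),
    ("vashikaran specialist baba ji", "", "Valid user", "stackoverflow.com", False, False, True),
])
def test_findspam_sample(title, body, username, site, body_is_summary, is_answer, expected_spam):
    test_findspam(title, body, username, site, body_is_summary, is_answer, expected_spam)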
def mock_post(title='', body='', site='stackoverflow.com',
              link='https://stackoverflow.com/a/1732454',
              owner={'link': 'https://stackoverflow.com/users/102937/robert-harvey'}):
    api_response = {
        "title": title,
        "body": body,
        "site": site,
        "link": link,
        "owner": owner
    }
    return Post(api_response=api_response)
def test_check_if_spam(title, body, username, site, match):
    # We can't check blacklists/whitelists in tests, so these are set to their default values
    post_dict = {
        "titleEncodedFancy": str(title),
        "bodySummary": str(body),
        "ownerDisplayName": str(username),
        "url": "TEST: No URL passed!",
        "id": "TEST: No ID passed!",
        "siteBaseHostAddress": str(site),
        "ownerUrl": "TEST: No Owner ID passed!"
    }
    json_dict = {
        "action": "155-questions-active",
        'data': json.dumps(post_dict),
        'IsAnswer': False  # If we want to test answers separately, this should be changed.
    }
    json_data = json.dumps(json_dict)
    post = Post(json_data=json_data)

    is_spam, reason, _ = check_if_spam(post)
    assert match == is_spam
def movies_page():
    if request.method == "POST":
        movie = Movie(request.form['title'].title(), "", "", "", "")
        score = request.form['score']
        comments = request.form['comment']
        if int(score) < 1 or int(score) > 10:
            flash("Your rating for the movie should be between 1 and 10.")
            return redirect(url_for('page.movies_page'))
        # Checks if user is logged in
        if current_user.get_id() is not None:
            movieId = movie.search_movie_in_db()
            if movieId != -1:
                userMoviePair = WatchedList(current_user.username, movieId, score)
                post = Post(current_user.get_user_id(), movieId, comments)
                oldscore = userMoviePair.existsInWatchedList()
                if oldscore != -1:
                    oldscore = oldscore[0]
                    if int(oldscore) == int(score):
                        flash("You have already added " + movie.title + ".")
                        return redirect(url_for('page.home_page'))
                    else:
                        userMoviePair.updateScoreOfWatchedMovie()
                        oldScoreMoviesTable = int(movie.getscore_in_movie_db(movieId)[0])
                        totalVotes = int(movie.getvotes_in_movie_db(movieId)[0])
                        newscore = ((oldScoreMoviesTable * totalVotes) - int(oldscore) + int(score)) / totalVotes
                        movie.update_votes_and_score(movieId, newscore, totalVotes)
                        flash("Your score for " + movie.title + " has been updated to " + score + ".")
                        return redirect(url_for('page.home_page'))
                else:
                    userMoviePair.add_movie_user_pair()
                    # Score and vote need to be updated on movies table
                    oldscore = int(movie.getscore_in_movie_db(movieId)[0])
                    totalVotes = int(movie.getvotes_in_movie_db(movieId)[0])
                    newscore = ((oldscore * totalVotes) + int(score)) / (totalVotes + 1)
                    totalVotes = totalVotes + 1
                    movie.update_votes_and_score(movieId, newscore, totalVotes)
                    post.add_post_to_db()
                    flash(movie.title + " is added to your watched list and your post has been saved.")
                    return redirect(url_for('page.home_page'))
            else:
                movieToAdd = movie.verify_movie_from_api()
                if movieToAdd == -1:
                    flash("There is no such movie")
                    return redirect(url_for('page.home_page'))
                else:
                    movieToAdd.score = score
                    movieToAdd.add_movie_to_db()
                    flash(movieToAdd.title + " (" + movieToAdd.year +
                          ") is added to your watched list and your post has been saved.")
                    movieId = movieToAdd.search_movie_in_db()
                    userMoviePair = WatchedList(current_user.username, movieId, score)
                    userMoviePair.add_movie_user_pair()
                    post = Post(current_user.get_user_id(), movieId, comments)
                    post.add_post_to_db()
                    return redirect(url_for('page.home_page'))
        else:
            flash("Please log in to MovieShake")
            return redirect(url_for('page.login_page'))
    else:
        if current_user.get_id() is not None:
            return render_template('movies.html')
        else:
            flash("Please log in to MovieShake")
            return redirect(url_for('page.login_page'))
from classes import Post, Collection
from selenium import webdriver

# Open browser
driver = webdriver.Chrome()

# # Create sample post
# post = Post(65313)
# post.get_text(driver)
# print(post.text)

# Create collection
collection = Collection()
for tag in range(65799, -1, -1):
    post = Post(tag)
    post.get_text(driver)
    print("Tag:", tag, '-', post.has_text())
    collection.insert(post)
    if tag % 100 == 0:
        # Save collection
        collection.to_df()
        collection.to_csv('collection' + str(tag // 100) + '.csv')
        collection.to_excel('collection' + str(tag // 100) + '.xlsx')
        collection = Collection()

# Close browser
driver.close()
def add_post(post_ch, post_txt, post_user, post_time):
    new_post = Post(post_ch, post_txt, post_user, post_time)
    emit("add_new_post", new_post.get_post_dict(post_ch), broadcast=True)
def make_api_call_for_site(self, site):
    if site not in self.queue:
        return

    self.queue_modify_lock.acquire()
    new_posts = self.queue.pop(site)
    store_bodyfetcher_queue()
    self.queue_modify_lock.release()

    new_post_ids = [int(k) for k, v in new_posts.items()]

    if GlobalVars.flovis is not None:
        for post_id in new_post_ids:
            GlobalVars.flovis.stage('bodyfetcher/api_request', site, post_id,
                                    {'queue': dict([[sk, [k for k, v in sq.items()]]
                                                    for sk, sq in self.queue.items()]),
                                     'site': site, 'posts': [k for k, v in new_posts.items()]})

    self.queue_timing_modify_lock.acquire()
    post_add_times = [v for k, v in new_posts.items()]
    pop_time = datetime.utcnow()

    for add_time in post_add_times:
        try:
            seconds_in_queue = (pop_time - add_time).total_seconds()
            if site in self.queue_timings:
                self.queue_timings[site].append(seconds_in_queue)
            else:
                self.queue_timings[site] = [seconds_in_queue]
        except Exception:
            continue  # Skip to next item if we've got invalid data or missing values.

    store_queue_timings()
    self.queue_timing_modify_lock.release()

    self.max_ids_modify_lock.acquire()

    if site in self.previous_max_ids and max(new_post_ids) > self.previous_max_ids[site]:
        previous_max_id = self.previous_max_ids[site]
        intermediate_posts = range(previous_max_id + 1, max(new_post_ids))

        # We don't want to go over the 100-post API cutoff, so take the last
        # (100-len(new_post_ids)) from intermediate_posts
        intermediate_posts = intermediate_posts[(100 - len(new_post_ids)):]

        # new_post_ids could contain edited posts, so merge it back in
        combined = chain(intermediate_posts, new_post_ids)

        # Could be duplicates, so uniquify
        posts = list(set(combined))
    else:
        posts = new_post_ids

    try:
        if max(new_post_ids) > self.previous_max_ids[site]:
            self.previous_max_ids[site] = max(new_post_ids)
            store_bodyfetcher_max_ids()
    except KeyError:
        self.previous_max_ids[site] = max(new_post_ids)
        store_bodyfetcher_max_ids()

    self.max_ids_modify_lock.release()

    log('debug', "New IDs / Hybrid Intermediate IDs for {0}:".format(site))
    log('debug', sorted(new_post_ids))
    log('debug', sorted(posts))

    question_modifier = ""
    pagesize_modifier = ""

    if site == "stackoverflow.com":
        # Not all SO questions are shown in the realtime feed. We now
        # fetch all recently modified SO questions to work around that.
        if self.last_activity_date != 0:
            pagesize = "50"
        else:
            pagesize = "25"

        pagesize_modifier = "&pagesize={pagesize}" \
                            "&min={time_length}".format(pagesize=pagesize,
                                                        time_length=str(self.last_activity_date))
    else:
        question_modifier = "/{0}".format(";".join(str(post) for post in posts))

    url = "https://api.stackexchange.com/2.2/questions{q_modifier}?site={site}" \
          "&filter=!)E0g*ODaEZ(SgULQhYvCYbu09*ss(bKFdnTrGmGUxnqPptuHP&key=IAkbitmze4B8KpacUfLqkw((" \
          "{optional_min_query_param}".format(q_modifier=question_modifier, site=site,
                                              optional_min_query_param=pagesize_modifier)

    # wait to make sure API has/updates post data
    time.sleep(3)

    GlobalVars.api_request_lock.acquire()
    # Respect backoff, if we were given one
    if GlobalVars.api_backoff_time > time.time():
        time.sleep(GlobalVars.api_backoff_time - time.time() + 2)
    try:
        time_request_made = datetime.now().strftime('%H:%M:%S')
        response = requests.get(url, timeout=20).json()
    except (requests.exceptions.Timeout, requests.ConnectionError, Exception):
        # Any failure in the request being made (timeout or otherwise) should be added back to
        # the queue.
        self.queue_modify_lock.acquire()
        if site in self.queue:
            self.queue[site].update(new_posts)
        else:
            self.queue[site] = new_posts
        self.queue_modify_lock.release()
        GlobalVars.api_request_lock.release()
        return

    self.api_data_lock.acquire()
    add_or_update_api_data(site)
    self.api_data_lock.release()

    message_hq = ""
    if "quota_remaining" in response:
        if response["quota_remaining"] - GlobalVars.apiquota >= 5000 and GlobalVars.apiquota >= 0:
            tell_rooms_with("debug", "API quota rolled over with {0} requests remaining. "
                                     "Current quota: {1}.".format(GlobalVars.apiquota,
                                                                  response["quota_remaining"]))

            sorted_calls_per_site = sorted(GlobalVars.api_calls_per_site.items(), key=itemgetter(1), reverse=True)
            api_quota_used_per_site = ""
            for site_name, quota_used in sorted_calls_per_site:
                sanitized_site_name = site_name.replace('.com', '').replace('.stackexchange', '')
                api_quota_used_per_site += sanitized_site_name + ": {0}\n".format(str(quota_used))
            api_quota_used_per_site = api_quota_used_per_site.strip()

            tell_rooms_with("debug", api_quota_used_per_site)
            clear_api_data()
        if response["quota_remaining"] == 0:
            tell_rooms_with("debug", "API reports no quota left! May be a glitch.")
            tell_rooms_with("debug", str(response))  # No code format for now?
        if GlobalVars.apiquota == -1:
            tell_rooms_with("debug", "Restart: API quota is {quota}.".format(quota=response["quota_remaining"]))
        GlobalVars.apiquota = response["quota_remaining"]
    else:
        message_hq = "The quota_remaining property was not in the API response."

    if "error_message" in response:
        message_hq += " Error: {} at {} UTC.".format(response["error_message"], time_request_made)
        if "error_id" in response and response["error_id"] == 502:
            if GlobalVars.api_backoff_time < time.time() + 12:  # Add a backoff of 10 + 2 seconds as a default
                GlobalVars.api_backoff_time = time.time() + 12
                message_hq += " Backing off on requests for the next 12 seconds."
        message_hq += " Previous URL: `{}`".format(url)

    if "backoff" in response:
        if GlobalVars.api_backoff_time < time.time() + response["backoff"]:
            GlobalVars.api_backoff_time = time.time() + response["backoff"]

    GlobalVars.api_request_lock.release()

    if len(message_hq) > 0:
        tell_rooms_with("debug", message_hq.strip())

    if "items" not in response:
        return

    if site == "stackoverflow.com":
        items = response["items"]
        if len(items) > 0 and "last_activity_date" in items[0]:
            self.last_activity_date = items[0]["last_activity_date"]

    num_scanned = 0
    start_time = time.time()

    for post in response["items"]:
        if "title" not in post or "body" not in post:
            if GlobalVars.flovis is not None and 'question_id' in post:
                GlobalVars.flovis.stage('bodyfetcher/api_response/no_content', site, post['question_id'], post)
            continue

        post['site'] = site
        try:
            post_ = Post(api_response=post)
        except PostParseError as err:
            log('error', 'Error {0} when parsing post: {1!r}'.format(err, post))
            if GlobalVars.flovis is not None and 'question_id' in post:
                GlobalVars.flovis.stage('bodyfetcher/api_response/error', site, post['question_id'], post)
            continue

        num_scanned += 1

        is_spam, reason, why = check_if_spam(post_)

        if is_spam:
            try:
                if GlobalVars.flovis is not None and 'question_id' in post:
                    GlobalVars.flovis.stage('bodyfetcher/api_response/spam', site, post['question_id'],
                                            {'post': post, 'check_if_spam': [is_spam, reason, why]})
                handle_spam(post=post_, reasons=reason, why=why)
            except Exception as e:
                log('error', "Exception in handle_spam:", e)
        elif GlobalVars.flovis is not None and 'question_id' in post:
            GlobalVars.flovis.stage('bodyfetcher/api_response/not_spam', site, post['question_id'],
                                    {'post': post, 'check_if_spam': [is_spam, reason, why]})

        try:
            if "answers" in post:
                for answer in post["answers"]:
                    num_scanned += 1
                    answer["IsAnswer"] = True  # Necessary for Post object
                    answer["title"] = ""  # Necessary for proper Post object creation
                    answer["site"] = site  # Necessary for proper Post object creation
                    answer_ = Post(api_response=answer, parent=post_)

                    is_spam, reason, why = check_if_spam(answer_)
                    if is_spam:
                        try:
                            if GlobalVars.flovis is not None and 'answer_id' in answer:
                                GlobalVars.flovis.stage('bodyfetcher/api_response/spam', site,
                                                        answer['answer_id'],
                                                        {'post': answer,
                                                         'check_if_spam': [is_spam, reason, why]})
                            handle_spam(answer_, reasons=reason, why=why)
                        except Exception as e:
                            log('error', "Exception in handle_spam:", e)
                    elif GlobalVars.flovis is not None and 'answer_id' in answer:
                        GlobalVars.flovis.stage('bodyfetcher/api_response/not_spam', site, answer['answer_id'],
                                                {'post': answer, 'check_if_spam': [is_spam, reason, why]})
        except Exception as e:
            log('error', "Exception handling answers:", e)

    end_time = time.time()
    GlobalVars.posts_scan_stats_lock.acquire()
    GlobalVars.num_posts_scanned += num_scanned
    GlobalVars.post_scan_time += end_time - start_time
    GlobalVars.posts_scan_stats_lock.release()
    return
def allspam(msg, url):
    """
    Reports all of a user's posts as spam
    :param msg: the chat message that invoked this command
    :param url: A user profile URL
    :return:
    """
    crn, wait = can_report_now(msg.owner.id, msg._client.host)
    if not crn:
        raise CmdException("You can execute the !!/allspam command again in {} seconds. "
                           "To avoid one user sending lots of reports in a few commands and "
                           "slowing SmokeDetector down due to rate-limiting, you have to "
                           "wait 30 seconds after you've reported multiple posts in "
                           "one go.".format(wait))
    user = get_user_from_url(url)
    if user is None:
        raise CmdException("That doesn't look like a valid user URL.")

    user_sites = []
    user_posts = []

    # Detect whether link is to network profile or site profile
    if user[1] == 'stackexchange.com':
        # Respect backoffs etc
        GlobalVars.api_request_lock.acquire()
        if GlobalVars.api_backoff_time > time.time():
            time.sleep(GlobalVars.api_backoff_time - time.time() + 2)

        # Fetch sites
        api_filter = "!6Pbp)--cWmv(1"
        request_url = "http://api.stackexchange.com/2.2/users/{}/associated?filter={}&key=IAkbitmze4B8KpacUfLqkw((" \
            .format(user[0], api_filter)
        res = requests.get(request_url).json()

        if "backoff" in res:
            if GlobalVars.api_backoff_time < time.time() + res["backoff"]:
                GlobalVars.api_backoff_time = time.time() + res["backoff"]
        GlobalVars.api_request_lock.release()

        if 'items' not in res or len(res['items']) == 0:
            raise CmdException("The specified user does not appear to exist.")
        if res['has_more']:
            raise CmdException("The specified user has an abnormally high number of accounts. Please consider "
                               "flagging for moderator attention, otherwise use !!/report on the user's posts "
                               "individually.")

        # Add accounts with posts
        for site in res['items']:
            if site['question_count'] > 0 or site['answer_count'] > 0:
                user_sites.append((site['user_id'], get_api_sitename_from_url(site['site_url'])))
    else:
        user_sites.append((user[0], get_api_sitename_from_url(user[1])))

    # Fetch posts
    for u_id, u_site in user_sites:
        # Respect backoffs etc
        GlobalVars.api_request_lock.acquire()
        if GlobalVars.api_backoff_time > time.time():
            time.sleep(GlobalVars.api_backoff_time - time.time() + 2)

        # Fetch posts
        api_filter = "!)Q4RrMH0DC96Y4g9yVzuwUrW"
        request_url = "http://api.stackexchange.com/2.2/users/{}/posts?site={}&filter={}" \
                      "&key=IAkbitmze4B8KpacUfLqkw((".format(u_id, u_site, api_filter)
        res = requests.get(request_url).json()

        if "backoff" in res:
            if GlobalVars.api_backoff_time < time.time() + res["backoff"]:
                GlobalVars.api_backoff_time = time.time() + res["backoff"]
        GlobalVars.api_request_lock.release()

        if 'items' not in res or len(res['items']) == 0:
            raise CmdException("The specified user has no posts on this site.")

        posts = res['items']
        if posts[0]['owner']['reputation'] > 100:
            raise CmdException("The specified user's reputation is abnormally high. Please consider flagging for "
                               "moderator attention, otherwise use !!/report on the posts individually.")

        # Add blacklisted user - use most downvoted post as post URL
        message_url = "https://chat.{}/transcript/{}?m={}".format(msg._client.host, msg.room.id, msg.id)
        add_blacklisted_user(user, message_url, sorted(posts, key=lambda x: x['score'])[0]['owner']['link'])

        # TODO: Postdata refactor, figure out a better way to use apigetpost
        for post in posts:
            post_data = PostData()
            post_data.post_id = post['post_id']
            post_data.post_url = url_to_shortlink(post['link'])
            *discard, post_data.site, post_data.post_type = fetch_post_id_and_site_from_url(
                url_to_shortlink(post['link']))
            post_data.title = unescape(post['title'])
            post_data.owner_name = unescape(post['owner']['display_name'])
            post_data.owner_url = post['owner']['link']
            post_data.owner_rep = post['owner']['reputation']
            post_data.body = post['body']
            post_data.score = post['score']
            post_data.up_vote_count = post['up_vote_count']
            post_data.down_vote_count = post['down_vote_count']

            if post_data.post_type == "answer":
                # Annoyingly we have to make another request to get the question ID, since it is only returned
                # by the /answers route

                # Respect backoffs etc
                GlobalVars.api_request_lock.acquire()
                if GlobalVars.api_backoff_time > time.time():
                    time.sleep(GlobalVars.api_backoff_time - time.time() + 2)

                # Fetch the answer
                answer_filter = "!*Jxb9s5EOrE51WK*"
                req_url = "http://api.stackexchange.com/2.2/answers/{}?site={}&filter={}" \
                          "&key=IAkbitmze4B8KpacUfLqkw((".format(post['post_id'], u_site, answer_filter)
                answer_res = requests.get(req_url).json()

                if "backoff" in answer_res:
                    if GlobalVars.api_backoff_time < time.time() + answer_res["backoff"]:
                        GlobalVars.api_backoff_time = time.time() + answer_res["backoff"]
                GlobalVars.api_request_lock.release()

                # Finally, set the attribute
                post_data.question_id = answer_res['items'][0]['question_id']
                post_data.is_answer = True

            user_posts.append(post_data)

    if len(user_posts) == 0:
        raise CmdException("The specified user hasn't posted anything.")
    if len(user_posts) > 15:
        raise CmdException("The specified user has an abnormally high number of spam posts. Please consider "
                           "flagging for moderator attention, otherwise use !!/report on the posts individually.")

    why_info = u"User manually reported by *{}* in room *{}*.\n".format(msg.owner.name, msg.room.name)

    # Handle all posts
    for index, post in enumerate(user_posts, start=1):
        batch = ""
        if len(user_posts) > 1:
            batch = " (batch report: post {} out of {})".format(index, len(user_posts))
        handle_spam(post=Post(api_response=post.as_dict),
                    reasons=["Manually reported " + post.post_type + batch],
                    why=why_info)
        time.sleep(2)  # Should this be implemented differently?

    if len(user_posts) > 2:
        add_or_update_multiple_reporter(msg.owner.id, msg._client.host, time.time())
def make_api_call_for_site(self, site):
    with self.queue_lock:
        new_posts = self.queue.pop(site, None)
    if new_posts is None:
        # site was not in the queue
        return
    Tasks.do(store_bodyfetcher_queue)

    new_post_ids = [int(k) for k in new_posts.keys()]

    if GlobalVars.flovis is not None:
        for post_id in new_post_ids:
            GlobalVars.flovis.stage('bodyfetcher/api_request', site, post_id,
                                    {'site': site, 'posts': list(new_posts.keys())})

    # Add queue timing data
    pop_time = datetime.utcnow()
    post_add_times = [(pop_time - v).total_seconds() for k, v in new_posts.items()]
    Tasks.do(add_queue_timing_data, site, post_add_times)

    store_max_ids = False
    with self.max_ids_modify_lock:
        if site in self.previous_max_ids and max(new_post_ids) > self.previous_max_ids[site]:
            previous_max_id = self.previous_max_ids[site]
            intermediate_posts = range(previous_max_id + 1, max(new_post_ids))

            # We don't want to go over the 100-post API cutoff, so take the last
            # (100-len(new_post_ids)) from intermediate_posts
            intermediate_posts = intermediate_posts[-(100 - len(new_post_ids)):]

            # new_post_ids could contain edited posts, so merge it back in
            combined = chain(intermediate_posts, new_post_ids)

            # Could be duplicates, so uniquify
            posts = list(set(combined))
        else:
            posts = new_post_ids

        new_post_ids_max = max(new_post_ids)
        if new_post_ids_max > self.previous_max_ids.get(site, 0):
            self.previous_max_ids[site] = new_post_ids_max
            store_max_ids = True

    if store_max_ids:
        schedule_store_bodyfetcher_max_ids()

    log('debug', "New IDs / Hybrid Intermediate IDs for {}:".format(site))
    if len(new_post_ids) > 30:
        log('debug', "{} +{} more".format(sorted(new_post_ids)[:30], len(new_post_ids) - 30))
    else:
        log('debug', sorted(new_post_ids))
    if len(new_post_ids) == len(posts):
        log('debug', "[ *Identical* ]")
    elif len(posts) > 30:
        log('debug', "{} +{} more".format(sorted(posts)[:30], len(posts) - 30))
    else:
        log('debug', sorted(posts))

    question_modifier = ""
    pagesize_modifier = {}

    if site == "stackoverflow.com":
        # Not all SO questions are shown in the realtime feed. We now
        # fetch all recently modified SO questions to work around that.
        with self.last_activity_date_lock:
            if self.last_activity_date != 0:
                pagesize = "100"
            else:
                pagesize = "50"

            pagesize_modifier = {
                'pagesize': pagesize,
                'min': str(self.last_activity_date - self.ACTIVITY_DATE_EXTRA_EARLIER_MS_TO_FETCH)
            }
    else:
        question_modifier = "/{0}".format(";".join([str(post) for post in posts]))

    url = "https://api.stackexchange.com/2.2/questions{}".format(question_modifier)
    params = {
        'filter': '!1rs)sUKylwB)8isvCRk.xNu71LnaxjnPS12*pX*CEOKbPFwVFdHNxiMa7GIVgzDAwMa',
        'key': 'IAkbitmze4B8KpacUfLqkw((',
        'site': site
    }
    params.update(pagesize_modifier)

    # wait to make sure API has/updates post data
    time.sleep(3)

    with GlobalVars.api_request_lock:
        # Respect backoff, if we were given one
        if GlobalVars.api_backoff_time > time.time():
            time.sleep(GlobalVars.api_backoff_time - time.time() + 2)
        try:
            time_request_made = datetime.utcnow().strftime('%H:%M:%S')
            response = requests.get(url, params=params, timeout=20).json()
        except (requests.exceptions.Timeout, requests.ConnectionError, Exception):
            # Any failure in the request being made (timeout or otherwise) should be added back to
            # the queue.
            with self.queue_lock:
                if site in self.queue:
                    self.queue[site].update(new_posts)
                else:
                    self.queue[site] = new_posts
            return

        with self.api_data_lock:
            add_or_update_api_data(site)

        message_hq = ""
        with GlobalVars.apiquota_rw_lock:
            if "quota_remaining" in response:
                quota_remaining = response["quota_remaining"]
                if quota_remaining - GlobalVars.apiquota >= 5000 and GlobalVars.apiquota >= 0 \
                        and quota_remaining > 39980:
                    tell_rooms_with("debug", "API quota rolled over with {0} requests remaining. "
                                             "Current quota: {1}.".format(GlobalVars.apiquota, quota_remaining))

                    sorted_calls_per_site = sorted(GlobalVars.api_calls_per_site.items(), key=itemgetter(1),
                                                   reverse=True)
                    api_quota_used_per_site = ""
                    for site_name, quota_used in sorted_calls_per_site:
                        sanitized_site_name = site_name.replace('.com', '').replace('.stackexchange', '')
                        api_quota_used_per_site += sanitized_site_name + ": {0}\n".format(str(quota_used))
                    api_quota_used_per_site = api_quota_used_per_site.strip()

                    tell_rooms_with("debug", api_quota_used_per_site)
                    clear_api_data()
                if quota_remaining == 0:
                    tell_rooms_with("debug", "API reports no quota left! May be a glitch.")
                    tell_rooms_with("debug", str(response))  # No code format for now?
                if GlobalVars.apiquota == -1:
                    tell_rooms_with("debug", "Restart: API quota is {quota}.".format(quota=quota_remaining))
                GlobalVars.apiquota = quota_remaining
            else:
                message_hq = "The quota_remaining property was not in the API response."

        if "error_message" in response:
            message_hq += " Error: {} at {} UTC.".format(response["error_message"], time_request_made)
            if "error_id" in response and response["error_id"] == 502:
                if GlobalVars.api_backoff_time < time.time() + 12:  # Add a backoff of 10 + 2 seconds as a default
                    GlobalVars.api_backoff_time = time.time() + 12
                    message_hq += " Backing off on requests for the next 12 seconds."
            message_hq += " Previous URL: `{}`".format(url)

        if "backoff" in response:
            if GlobalVars.api_backoff_time < time.time() + response["backoff"]:
                GlobalVars.api_backoff_time = time.time() + response["backoff"]

    if len(message_hq) > 0 and "site is required" not in message_hq:
        message_hq = message_hq.strip()
        if len(message_hq) > 500:
            message_hq = "\n" + message_hq
        tell_rooms_with("debug", message_hq)

    if "items" not in response:
        return

    if site == "stackoverflow.com":
        items = response["items"]
        if len(items) > 0 and "last_activity_date" in items[0]:
            with self.last_activity_date_lock:
                self.last_activity_date = items[0]["last_activity_date"]

    num_scanned = 0
    start_time = time.time()

    for post in response["items"]:
        if GlobalVars.flovis is not None:
            pnb = copy.deepcopy(post)
            if 'body' in pnb:
                pnb['body'] = 'Present, but truncated'
            if 'answers' in pnb:
                del pnb['answers']

        if "title" not in post or "body" not in post:
            if GlobalVars.flovis is not None and 'question_id' in post:
                GlobalVars.flovis.stage('bodyfetcher/api_response/no_content', site, post['question_id'], pnb)
            continue

        post['site'] = site
        try:
            post['edited'] = (post['creation_date'] != post['last_edit_date'])
        except KeyError:
            post['edited'] = False  # last_edit_date not present = not edited

        question_doesnt_need_scan = is_post_recently_scanned_and_unchanged(post)
        add_recently_scanned_post(post)
        if not question_doesnt_need_scan:
            try:
                post_ = Post(api_response=post)
            except PostParseError as err:
                log('error', 'Error {0} when parsing post: {1!r}'.format(err, post))
                if GlobalVars.flovis is not None and 'question_id' in post:
                    GlobalVars.flovis.stage('bodyfetcher/api_response/error', site, post['question_id'], pnb)
                continue

            num_scanned += 1

            is_spam, reason, why = check_if_spam(post_)

            if is_spam:
                try:
                    if GlobalVars.flovis is not None and 'question_id' in post:
                        GlobalVars.flovis.stage('bodyfetcher/api_response/spam', site, post['question_id'],
                                                {'post': pnb, 'check_if_spam': [is_spam, reason, why]})
                    handle_spam(post=post_, reasons=reason, why=why)
                except Exception as e:
                    log('error', "Exception in handle_spam:", e)
            elif GlobalVars.flovis is not None and 'question_id' in post:
                GlobalVars.flovis.stage('bodyfetcher/api_response/not_spam', site, post['question_id'],
                                        {'post': pnb, 'check_if_spam': [is_spam, reason, why]})

            try:
                if "answers" in post:
                    for answer in post["answers"]:
                        if GlobalVars.flovis is not None:
                            anb = copy.deepcopy(answer)
                            if 'body' in anb:
                                anb['body'] = 'Present, but truncated'

                        num_scanned += 1
                        answer["IsAnswer"] = True  # Necessary for Post object
                        answer["title"] = ""  # Necessary for proper Post object creation
                        answer["site"] = site  # Necessary for proper Post object creation
                        try:
                            answer['edited'] = (answer['creation_date'] != answer['last_edit_date'])
                        except KeyError:
                            answer['edited'] = False  # last_edit_date not present = not edited

                        answer_doesnt_need_scan = is_post_recently_scanned_and_unchanged(answer)
                        add_recently_scanned_post(answer)
                        if answer_doesnt_need_scan:
                            continue

                        answer_ = Post(api_response=answer, parent=post_)

                        is_spam, reason, why = check_if_spam(answer_)
                        if is_spam:
                            try:
                                if GlobalVars.flovis is not None and 'answer_id' in answer:
                                    GlobalVars.flovis.stage('bodyfetcher/api_response/spam', site,
                                                            answer['answer_id'],
                                                            {'post': anb,
                                                             'check_if_spam': [is_spam, reason, why]})
                                handle_spam(answer_, reasons=reason, why=why)
                            except Exception as e:
                                log('error', "Exception in handle_spam:", e)
                        elif GlobalVars.flovis is not None and 'answer_id' in answer:
                            GlobalVars.flovis.stage('bodyfetcher/api_response/not_spam', site,
                                                    answer['answer_id'],
                                                    {'post': anb, 'check_if_spam': [is_spam, reason, why]})
            except Exception as e:
                log('error', "Exception handling answers:", e)

    end_time = time.time()
    scan_time = end_time - start_time
    GlobalVars.PostScanStat.add_stat(num_scanned, scan_time)
    return
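# Worked sketch (not from the source): how the hybrid ID list above is formed when the
# queue holds IDs 105 and 110 and the previously seen maximum for the site was 100.
# All numbers are invented for illustration.
from itertools import chain

previous_max_id = 100
new_post_ids = [105, 110]
intermediate = range(previous_max_id + 1, max(new_post_ids))  # IDs 101..109
intermediate = intermediate[-(100 - len(new_post_ids)):]      # keep at most 100 - 2 = 98 of them
posts = list(set(chain(intermediate, new_post_ids)))          # uniquified merge
# posts now covers 101..110, staying under the 100-ID API cutoff.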
def report(msg, urls):
    """
    Report a post (or posts)
    :param msg: the chat message that invoked this command
    :param urls: a space-separated list of post URLs
    :return: A string (or None)
    """
    crn, wait = can_report_now(msg.owner.id, msg._client.host)
    if not crn:
        raise CmdException("You can execute the !!/report command again in {} seconds. "
                           "To avoid one user sending lots of reports in a few commands and "
                           "slowing SmokeDetector down due to rate-limiting, you have to "
                           "wait 30 seconds after you've reported multiple posts in "
                           "one go.".format(wait))

    output = []
    urls = list(set(urls.split()))

    if len(urls) > 5:
        raise CmdException("To avoid SmokeDetector reporting posts too slowly, you can "
                           "report at most 5 posts at a time. This is to avoid "
                           "SmokeDetector's chat messages getting rate-limited too much, "
                           "which would slow down reports.")

    for index, url in enumerate(urls, start=1):
        post_data = api_get_post(url)

        if post_data is None:
            output.append("Post {}: That does not look like a valid post URL.".format(index))
            continue

        if post_data is False:
            output.append("Post {}: Could not find data for this post in the API. "
                          "It may already have been deleted.".format(index))
            continue

        if has_already_been_posted(post_data.site, post_data.post_id, post_data.title) \
                and not is_false_positive((post_data.post_id, post_data.site)):
            # Don't re-report if the post wasn't marked as a false positive. If it was marked as a false
            # positive, this re-report might be attempting to correct that/fix a mistake/etc.
            if GlobalVars.metasmoke_key is not None:
                se_link = to_protocol_relative(post_data.post_url)
                ms_link = "https://m.erwaysoftware.com/posts/by-url?url={}".format(se_link)
                output.append("Post {}: Already recently reported [ [MS]({}) ]".format(index, ms_link))
                continue
            else:
                output.append("Post {}: Already recently reported".format(index))
                continue

        post_data.is_answer = (post_data.post_type == "answer")
        post = Post(api_response=post_data.as_dict)
        user = get_user_from_url(post_data.owner_url)

        if user is not None:
            message_url = "https://chat.{}/transcript/{}?m={}".format(msg._client.host, msg.room.id, msg.id)
            add_blacklisted_user(user, message_url, post_data.post_url)

        why_info = u"Post manually reported by user *{}* in room *{}*.\n".format(msg.owner.name, msg.room.name)
        batch = ""
        if len(urls) > 1:
            batch = " (batch report: post {} out of {})".format(index, len(urls))

        handle_spam(post=post,
                    reasons=["Manually reported " + post_data.post_type + batch],
                    why=why_info)

    if 1 < len(urls) > len(output):
        add_or_update_multiple_reporter(msg.owner.id, msg._client.host, time.time())

    if len(output) > 0:
        return os.linesep.join(output)