# Standard-library and Flask imports used across this listing; the
# project-local names (SiteComment, DBModelAdder, CSVDataUploader, etc.)
# are assumed to come from this package — their exact module paths are
# not shown in the source.
import csv
import datetime
import json
import os
import shutil

from flask import (abort, g, jsonify, make_response, redirect,
                   render_template, request, session, url_for)


def add_to_db(adder, comments, verified, verified_user_id, is_rude):
    # Insert parsed CSV comments, skipping those already in the DB.
    for parsed_args in comments:
        params = CSVDataUploader.make_site_comment_params(
            parsed_args, verified, verified_user_id, is_rude)
        if SiteComment.is_exist(adder, params.get('comment_id')):
            continue
        adder.add(SiteComment(params))

def play():
    # Count how many verified-rude and verified-normal comments contain
    # at least one humiliation word from Wiktionary.
    rude_comments = SiteComment.rude_comments()
    normal_comments = SiteComment.normal_comments()
    rude_words_wiki = WiktionaryOrg.humiliation_words()

    total_rude = 0
    total_normal = 0
    for comment in rude_comments:
        # The original check `len(split(' ')) == 0` could never trigger;
        # skip genuinely empty bodies instead.
        if not comment.processed_body.strip():
            continue
        words = [word for word in comment.processed_body.split(u' ')
                 if word in rude_words_wiki]
        if len(words) > 0:
            total_rude += 1
            # print("[Words: %s] ([%s]) || %s\r\n" % (
            #     u' '.join(words),
            #     u' '.join(comment.processed_body.split(u' ')),
            #     comment.body))

    for comment in normal_comments:
        if not comment.processed_body.strip():
            continue
        words = [word for word in comment.processed_body.split(u' ')
                 if word in rude_words_wiki]
        if len(words) > 0:
            total_normal += 1
            # print("[Words: %s] || %s\r\n" % (str(words), comment.body))

    print("Total rude %s, total normal %s" % (total_rude, total_normal))

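# Minimal sketch (toy data) of the membership test play() applies to each
# comment's processed_body; the helper name is hypothetical.
def _contains_rude_word(processed_body, rude_words):
    # Split on single spaces, exactly as above, and look for any overlap.
    return any(word in rude_words for word in processed_body.split(u' '))

# _contains_rude_word(u"you are idiot", {u"idiot"})  ->  True
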
def api_features():
    # Plot data for a single manual feature: one point per verified
    # comment, labelled with its class.
    if g.user is None:
        abort(404)

    x = int(request.args.get("x", -1))
    if x < 0:
        abort(404)

    if CURRENT_MODEL != MODEL_LOGISITIC_REGRESSION:
        abort(404)

    rude_comments = SiteComment.rude_comments()
    normal_comments = SiteComment.normal_comments()

    def get_data(comments, feature, label):
        data = list()
        for comment in comments:
            feature_value = SiteCommentFeatures.manual_feature_value(
                comment, feature)
            data.append({"x": feature_value, "label": label})
        return data

    positive_data = get_data(rude_comments, x, SiteCommentFeatures.RUDE_CLASS)
    negative_data = get_data(normal_comments, x,
                             SiteCommentFeatures.NORMAL_CLASS)

    return jsonify(**{
        "x_name": SiteCommentFeatures.feature_desc(x),
        "positive": positive_data,
        "negative": negative_data
    })

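# Example response shape (illustrative values; the numeric labels are
# whatever SiteCommentFeatures.RUDE_CLASS and NORMAL_CLASS are defined as):
#
#   {
#     "x_name": "<feature description>",
#     "positive": [{"x": 0.73, "label": 1}, ...],
#     "negative": [{"x": 0.12, "label": 0}, ...]
#   }
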
def analyse_with_bayes_classifier():
    rude_comments = SiteComment.rude_comments()
    normal_comments = SiteComment.normal_comments()

    classifier = BinaryNaiveBayesClassifier(True)
    classifier.train(rude_comments, normal_comments)
    classifier.print_params()

    return classifier

def actions_verify(comment_id):
    # Moderator-only: mark a comment as verified rude/normal.
    if g.user is None or g.user.role != "moderator":
        abort(404)

    if request.args.get("is_rude", None) is None:
        abort(404)
    # "true"/"false" (any case) -> bool via the JSON parser.
    is_rude = json.loads(request.args.get("is_rude").lower())

    comment = SiteComment.by_comment_id(comment_id)
    if comment is None:
        abort(404)

    adder = DBModelAdder()
    adder.start()
    comment.verified = datetime.datetime.now()
    comment.is_rude = is_rude
    comment.verified_user_id = g.user.user_id
    comment.skipped = None
    adder.add(comment)
    adder.done()

    resp = {
        "status": True,
        "msg": "OK",
        "comment_id": comment_id,
        "is_rude": is_rude,
        "verified_user_id": g.user.user_id
    }
    return jsonify(**resp)

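# Usage sketch: the route decorator is not shown in this listing, so the
# URL below is hypothetical. A logged-in moderator marks comment 123 as rude:
#
#   curl "https://<app-host>/actions/verify/123?is_rude=true"
#
# The handler answers with the JSON document built in `resp` above.
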
def analyse_comments(analysed_at=None):
    # Run the current model over comments pending analysis (optionally
    # bounded by `analysed_at`) and flag the ones that look rude.
    classifier = None
    if CURRENT_MODEL == MODEL_LOGISITIC_REGRESSION:
        classifier = analyse_with_logistic_regression()

    if classifier is None:
        print("Classifier is not set up. Set up classifier first.")
        return

    print("Model is ready. Starting analysis...")

    suspected = 0
    adder = DBModelAdder()
    adder.start()
    comments_for_analysis = SiteComment.comments_for_analysis(analysed_at)
    for comment in comments_for_analysis:
        comment.analysed = datetime.datetime.now()
        comment.looks_rude = classifier.classify_rude(comment)
        adder.add(comment)
        if comment.looks_rude:
            suspected += 1
    adder.done()

    print("Analysis was done for %s comments, %s suspected to be rude." %
          (len(comments_for_analysis), suspected))

def load_comments_from_se_to_db():
    # Fetch comments newer than the latest one we have, enrich them with
    # details about their posts, and store the result.
    def make_site_comment_params(comment, info):
        comment_id, post_id, body, creation_date, author_id, author_name = comment
        (question_id, answer_id, post_author_id, post_author_name,
         score, title, post_creation_date) = info
        return {
            "comment_id": comment_id,
            "question_id": question_id,
            "answer_id": answer_id,
            "post_author_id": post_author_id,
            "post_score": score,
            "title": title,
            "body": body,
            "processed_body": process_text(body),
            "creation_date": creation_date,
            "author_id": author_id,
            "author_name": author_name,
            "verified": None,
            "is_rude": False,
            "diff_with_post": (creation_date - post_creation_date).total_seconds()
        }

    last_one = SiteComment.last_comment()
    comments = get_recent_comments(last_one.creation_date)

    # Request post details in pages of 20 post ids at a time.
    infos = dict()
    ids = [comment[1] for comment in comments]  # post_id is field 1
    page_size = 20
    counter = 0
    while counter < len(ids):
        req_ids = ids[counter:counter + page_size]
        info = get_post_infos(req_ids)
        infos.update(info)
        counter += page_size

    adder = DBModelAdder()
    adder.start()
    for comment in comments:
        if SiteComment.is_exist(adder, comment[0]):
            continue
        adder.add(SiteComment(
            make_site_comment_params(comment, infos.get(comment[1]))))
    adder.done()

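# Paging walk-through (illustrative numbers): with 45 post ids and
# page_size = 20, the loop calls get_post_infos over ids[0:20], ids[20:40]
# and ids[40:60]; the last slice safely yields just the remaining 5 ids.
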
def create_model():
    if CURRENT_MODEL == MODEL_LOGISITIC_REGRESSION:
        feature_list = SiteCommentFeatureList(SiteComment.rude_comments(),
                                              SiteComment.normal_comments())
        feature_maker = feature_list.maker()

        classifier = LogisticRegaression(feature_list, feature_maker, True)
        classifier.train()
        rude_total, rude_right, normal_total, normal_right = classifier.test(True)

        # Per-class recall, then a score that weights each class's recall
        # by its share of correctly classified objects.
        tpr = float(rude_right) / float(rude_total)
        tnr = float(normal_right) / float(normal_total)
        total_objects = float(rude_total + normal_total)
        acc = (rude_right / total_objects) * tpr + \
              (normal_right / total_objects) * tnr

        print("Accuracy: %s, rude: %s (%s), normal: %s (%s)" %
              (acc, rude_right, rude_total, normal_right, normal_total))

        adder = DBModelAdder()
        adder.start()

        # Persist the feature maker and the classifier as JSON blobs.
        feature_data = feature_maker.store()
        json_fd = JSONObjectData(JSONObjectData.FEATURE_TYPE_ID,
                                 json.dumps(feature_data))
        adder.add(json_fd)

        classifier_data = classifier.store()
        classifier_extra = {
            "acc": acc,
            "rude_right": rude_right,
            "rude_total": rude_total,
            "normal_right": normal_right,
            "normal_total": normal_total
        }
        json_cd = JSONObjectData(JSONObjectData.LOGREG_TYPE_ID,
                                 json.dumps(classifier_data),
                                 json.dumps(classifier_extra))
        adder.add(json_cd)

        adder.done()
        print("A new logistic regression classifier was added to the DB.")
    else:
        print("Please specify a model to create first.")

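# Worked example of the score above (hypothetical counts): with
# rude_total=50, rude_right=40, normal_total=150, normal_right=140:
#   tpr = 40/50 = 0.8, tnr = 140/150 ~= 0.933
#   acc = (40/200)*0.8 + (140/200)*0.933 ~= 0.16 + 0.653 = 0.813
# Note this differs from plain accuracy, (40+140)/200 = 0.9, because each
# class's recall is weighted by that class's share of correct answers.
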
def verifying():
    if g.user is None or g.user.role != "moderator":
        return redirect(url_for('index'))

    page = max(int(request.args.get("page", 1)), 1)
    paginator = SiteComment.paginate_unverified(page)
    return render_template('index.html',
                           paginator=paginator,
                           base_url=url_for("verifying"),
                           so_url=SO_URL,
                           active_tab="verifying")

def analyse_with_cosine():
    # Vectorise rude and unverified comments over shared corpus stats,
    # then print the biggest cluster of mutually similar rude comments.
    stats = DocsStats()

    rude_comments = SiteComment.rude_comments()
    rude_docs = list()
    for comment in rude_comments:
        rude_docs.append(
            Document(stats, comment.id, comment.body, comment.processed_body))

    unverified_comments = SiteComment.comments_for_analysis()
    unverified_docs = list()
    for comment in unverified_comments:
        unverified_docs.append(
            Document(stats, comment.id, comment.body, comment.processed_body))

    stats.calculate_idfs()
    stats.vectorise_documents()

    cosine = CosineSimilarity(rude_docs)
    rude_cluster = cosine.biggest_cluster()
    for item in rude_cluster:
        print("- ", item.body, "\r\n")

def comment_feed():
    # Atom feed of comments the model flagged as rude.
    limit = min(int(session.get("limit", 30)), 1000)
    comments = SiteComment.analysed_as_rude(limit)

    last_update = datetime.datetime.now()
    if len(comments) > 0:
        last_update = comments[0].analysed

    resp = make_response(
        render_template('feed_proto.xml',
                        app_url=APP_URL,
                        app_title=FEED_APP_TITLE,
                        so_url=SO_URL,
                        last_update=last_update,
                        entries=comments))
    resp.headers['Content-type'] = 'application/atom+xml; charset=utf-8'
    return resp

def dump_verified_comments():
    # Dump verified rude/normal/skipped comments into ./dump as CSV,
    # recreating the directory on every run.
    directory = "./dump"
    if os.path.exists(directory):
        shutil.rmtree(directory)
    os.makedirs(directory)

    def dump_to(filename, comments):
        def date_or_none(field):
            return field.strftime("%Y-%m-%d %H:%M:%S") if field is not None else None

        with open(filename, 'w', encoding="utf8") as csvfile:
            writer = csv.writer(csvfile, lineterminator='\n', delimiter=',')
            for comment in comments:
                writer.writerow([
                    comment.comment_id, comment.question_id,
                    comment.answer_id, comment.post_author_id,
                    comment.post_score, comment.title, comment.body,
                    date_or_none(comment.creation_date), comment.author_id,
                    comment.author_name, comment.diff_with_post,
                    date_or_none(comment.verified), comment.is_rude,
                    comment.verified_user_id, date_or_none(comment.added),
                    date_or_none(comment.analysed), comment.looks_rude,
                    date_or_none(comment.skipped)
                ])

    rude_comments = SiteComment.rude_comments()
    dump_to(directory + "/rude_comments.csv", rude_comments)

    normal_comments = SiteComment.normal_comments()
    dump_to(directory + "/normal_comments.csv", normal_comments)

    skipped_comments = SiteComment.skipped_comments()
    dump_to(directory + "/skipped_comments.csv", skipped_comments)

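# Round-trip sketch: each dump row has the 18 columns written above, so a
# dump can be read back with the standard csv module and handed to the CSV
# import path at the top of this listing (assuming
# CSVDataUploader.make_site_comment_params expects this column order):
#
#   with open("./dump/rude_comments.csv", encoding="utf8") as csvfile:
#       rows = list(csv.reader(csvfile))
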
def actions_skipp(comment_id):
    # Moderator-only: postpone a comment instead of verifying it.
    if g.user is None or g.user.role != "moderator":
        abort(404)

    comment = SiteComment.by_comment_id(comment_id)
    if comment is None:
        abort(404)

    adder = DBModelAdder()
    adder.start()
    comment.skipped = datetime.datetime.now()
    comment.verified = None
    comment.verified_user_id = -1
    adder.add(comment)
    adder.done()

    resp = {"status": True, "msg": "OK", "comment_id": comment_id}
    return jsonify(**resp)

def check_to_rebuild():
    # Rebuild the model if none has been stored yet, or if enough newly
    # verified comments have accumulated since the last build.
    def rebuild():
        create_model()
        print("Now, do analysis for previous comments with the new model...")
        analyse_comments(datetime.datetime.now())

    saved_data = JSONObjectData.last(JSONObjectData.LOGREG_TYPE_ID)
    feature_saved_data = JSONObjectData.last(JSONObjectData.FEATURE_TYPE_ID)
    if saved_data is None or feature_saved_data is None:
        print("There is no saved data. Starting rebuilding...")
        rebuild()
        return

    unseen_for_model = SiteComment.verified_after(saved_data.added)
    print("There are currently %s comments which the model has not seen. "
          "The threshold is %s." % (unseen_for_model, REBUILD_MODEL_THRESHOLD))
    if unseen_for_model >= REBUILD_MODEL_THRESHOLD:
        print("We are above the threshold. Starting rebuilding...")
        rebuild()
        return

    print("No reason to rebuild. We will wait a bit more.")

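# Example run (hypothetical numbers): with REBUILD_MODEL_THRESHOLD = 100,
# finding 120 comments verified after the stored model's `added` timestamp
# triggers a rebuild; finding 40 only logs that we will wait a bit more.
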
def add(adder, comments):
    for comment in comments:
        if SiteComment.is_exist(adder, comment.get("comment_id")):
            continue
        adder.add(SiteComment(comment))

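# Usage sketch (the start/add/done pattern matches the other writers in
# this listing; `comment_dicts` is a hypothetical list of param dicts such
# as those built by make_site_comment_params):
#
#   adder = DBModelAdder()
#   adder.start()
#   add(adder, comment_dicts)
#   adder.done()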