import pickle
import random

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn import svm
from sklearn.metrics import classification_report

from politeness.features.vectorizer import PolitenessFeatureVectorizer


def train_svm(documents, ntesting=500):
    """
    :param documents: politeness-annotated training data
    :type documents: list of dicts;
        each document must be preprocessed and have
        'sentences', 'parses', and 'score' fields.

    :param ntesting: number of docs to reserve for testing
    :type ntesting: int

    returns fitted SVC, which can be serialized using pickle
    """
    # Generate and persist list of unigrams, bigrams
    PolitenessFeatureVectorizer.generate_bow_features(documents)

    # Shuffle so the held-out test split is random
    random.shuffle(documents)
    testing = documents[-ntesting:]
    documents = documents[:-ntesting]

    # SAVE FOR NOW: persist the held-out test documents
    with open("testing-data.p", 'wb') as f:
        pickle.dump(testing, f)

    X, y = documents2feature_vectors(documents)
    Xtest, ytest = documents2feature_vectors(testing)

    print("Fitting")
    clf = svm.SVC(C=0.02, kernel='linear', probability=True)
    clf.fit(X, y)

    # Test
    y_pred = clf.predict(Xtest)
    print(classification_report(ytest, y_pred))

    return clf
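# --- Usage sketch (illustrative, not part of the original script) ---
# The docstring above notes that the fitted SVC can be serialized with
# pickle. A minimal round-trip; the file name "politeness-svm.p" is a
# hypothetical choice, not taken from the source:
def _example_train_and_persist(documents, path="politeness-svm.p"):
    clf = train_svm(documents)
    with open(path, "wb") as f:
        pickle.dump(clf, f)       # persist the fitted model
    with open(path, "rb") as f:
        return pickle.load(f)     # reload it for later scoring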
def documents2feature_vectors(documents):
    vectorizer = PolitenessFeatureVectorizer()
    fks = None
    X, y = [], []
    for d in documents:
        fs = vectorizer.features(d)
        if fks is None:
            fks = sorted(fs.keys())
        # Keep feature columns in the same (sorted) order for every document
        fv = [fs[f] for f in fks]
        # If politeness score > 0.0, the doc is polite, class=1
        label = 1 if d['score'] > 0.0 else 0
        X.append(fv)
        y.append(label)
    X = csr_matrix(np.asarray(X))
    y = np.asarray(y)
    return X, y
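# --- Output-shape sketch (illustrative, not part of the original script) ---
# documents2feature_vectors() returns a scipy CSR matrix with one row per
# document and a parallel 0/1 label array. A minimal sanity check, assuming
# `documents` is already preprocessed and score-annotated:
def _example_check_vectors(documents):
    X, y = documents2feature_vectors(documents)
    assert X.shape[0] == y.shape[0] == len(documents)  # one row/label per doc
    assert set(np.unique(y)) <= {0, 1}                 # binary polite labels
    return X.shape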
if __name__ == "__main__":
    """
    Train a model on the politeness-annotated request docs
    in Pol600withLabel.csv
    """
    # from politeness.test_documents import TEST_DOCUMENTS
    codings = pd.read_csv('Pol600withLabel.csv', index_col=0, parse_dates=True)
    docs = codings['Request'].tolist()
    documents = PolitenessFeatureVectorizer.preprocess(docs)
    train_svm(documents, ntesting=1)
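# --- Label-attachment sketch (illustrative, not part of the original script) ---
# train_svm() expects each preprocessed document dict to carry a 'score'
# field (see documents2feature_vectors). If the annotation lives in a CSV
# column, it could be attached like this; the column name 'Score' is a
# hypothetical stand-in, since the actual column in Pol600withLabel.csv is
# not shown here:
def _example_attach_scores(documents, codings, score_col='Score'):
    for doc, score in zip(documents, codings[score_col].tolist()):
        doc['score'] = float(score)
    return documents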
from politeness.features.vectorizer import PolitenessFeatureVectorizer

ret = PolitenessFeatureVectorizer.preprocess([
    "what do you think was the purpose of this Fesselballon? if it wasn't military, what was the point in shooting it down?"
])
print(ret)
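# --- Output sketch (illustrative, not part of the original snippet) ---
# Per the train_svm() docstring, preprocess() yields document dicts carrying
# at least 'sentences' and 'parses' fields; a quick inspection:
for doc in ret:
    print(doc['sentences'])  # sentence-split text
    print(doc['parses'])     # parses for each sentence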
def index():
    global login
    if not login:
        return redirect(url_for('login'))

    cur = mysql.connection.cursor()
    # cur.execute("""DROP TABLE IF EXISTS Feedback_Doc;""")
    # cur.execute("""DROP TABLE IF EXISTS Feedback_Sentence;""")
    # cur.execute("""DROP TABLE IF EXISTS Input;""")
    cur.execute("""CREATE TABLE IF NOT EXISTS Input (
        input_id INTEGER PRIMARY KEY AUTO_INCREMENT,
        user_id TEXT,
        message TEXT,
        time_stamp TEXT
    )""")
    cur.execute("""CREATE TABLE IF NOT EXISTS Feedback_Doc (
        input_id INTEGER PRIMARY KEY,
        word_count INTEGER,
        label TEXT,
        impoliteness_score REAL,
        politeness_score REAL,
        FOREIGN KEY (input_id) REFERENCES Input(input_id)
    )""")
    cur.execute("""CREATE TABLE IF NOT EXISTS Feedback_Sentence (
        id INTEGER PRIMARY KEY AUTO_INCREMENT,
        input_id INTEGER,
        sentence_content TEXT,
        label TEXT,
        impoliteness_score REAL,
        politeness_score REAL,
        strategy_count INTEGER,
        strategies VARCHAR(255),
        indices VARCHAR(255),
        FOREIGN KEY (input_id) REFERENCES Input(input_id)
    )""")

    label_string = ""
    input_text = ""
    title = ""
    strategies_set = set()
    highlight_index_set = set()
    strategies = []
    strategies_all = []

    if request.method == 'POST':
        title = request.form['theme']
        input_text = request.form['sentence']

        # Check for grammatical mistakes
        grammar_check = api.check(input_text,
                                  api_url='https://languagetool.org/api/v2/',
                                  lang='en-US')
        grammar_messages = grammar_check['matches']
        grammar_corrections, split_input, wrong_words, impolite_words, replacements = [], [], [], [], {}
        if len(grammar_messages) != 0:
            for i in range(len(grammar_messages)):
                # og_msg = grammar_messages[i]['context']['text']
                og_msg = input_text
                offset = grammar_messages[i]['offset']
                grammar_corrections.append(grammar_messages[i]['message'])
                wrong_words.append(og_msg[offset:offset + grammar_messages[i]['length']])
                for repl in grammar_messages[i]['replacements']:
                    if i not in replacements:
                        replacements[i] = [repl['value']]
                    else:
                        replacements[i].append(repl['value'])
        split_input = input_text.split()  ### NEEDS TO BE CHANGED LATER...
        num_corrections = str(len(replacements))
        print(wrong_words)
        print(replacements)

        # Get politeness score for overall document
        doc_res = score_text(input_text)
        print("DOCUMENT POLITENESS:\n", doc_res)
        label_string = doc_res[0]

        # Get politeness score for each sentence in document
        sentence_list = nltk.sent_tokenize(input_text)
        sent_politeness_res = list()
        impolite_sentence_indices = dict()
        for i, sentence in enumerate(sentence_list):
            # Politeness score for this sentence
            res = score_text(sentence)
            label, impolite_score, polite_score = res[0], res[1], res[2]
            # Strategies feedback
            doc = PolitenessFeatureVectorizer.preprocess([sentence])[0]
            strategies = get_feedback(doc)
            for strat in strategies:
                strategies_set.add(strat[0])
                highlight_index_set.add(strat[1][0])
            sent_politeness_res.append(
                (sentence, label, impolite_score, polite_score, strategies)
            )
        print("PER SENTENCE POLITENESS\n", sent_politeness_res)
        # Strategies for the first sentence only
        strategies_all = sent_politeness_res[0][4]

        now = datetime.datetime.now().strftime("%b %d %Y %H:%M:%S")
        cur.execute("INSERT INTO Input (user_id, message, time_stamp) VALUES (%s, %s, %s)",
                    (g.user, input_text, now))
        # Recover the auto-incremented id of the row just inserted
        cur.execute("SELECT input_id FROM Input WHERE time_stamp = %s", (now,))
        input_id = cur.fetchone()[0]
        cur.execute("INSERT INTO Feedback_Doc (input_id, word_count, label, "
                    "impoliteness_score, politeness_score) VALUES (%s, %s, %s, %s, %s)",
                    (input_id, len(input_text.split()), doc_res[0],
                     float(doc_res[1]), float(doc_res[2])))
        for m in sent_politeness_res:
            strategies = [i[0] for i in m[4]]
            strategies_idx = [i[1] for i in m[4]]
            cur.execute(
                "INSERT INTO Feedback_Sentence (input_id, sentence_content, label, "
                "impoliteness_score, politeness_score, strategy_count, strategies, indices) "
                "VALUES (%s, %s, %s, %s, %s, %s, %s, %s)",
                (input_id, m[0], m[1], float(m[2]), float(m[3]),
                 len(m[4]), str(strategies), str(strategies_idx)))
        mysql.connection.commit()
        print(strategies)

    cur.close()
    original_text = input_text
    return render_template('new_feedback.html', label_string=label_string,
                           user_input=input_text, title=title,
                           strategies_list=strategies_set,
                           strategies=strategies_all,
                           highlight_index=highlight_index_set)
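# --- score_text() sketch (illustrative; the real helper lives elsewhere) ---
# From the calls above, score_text() evidently returns a
# (label, impoliteness_score, politeness_score) triple. A minimal sketch of
# such a helper on top of the SVC from train_svm(); the `clf` and
# `vectorizer` parameters are hypothetical names, and the
# [impolite, polite] column order assumes clf.classes_ == [0, 1]:
import numpy as np

def _example_score_text(text, clf, vectorizer):
    doc = PolitenessFeatureVectorizer.preprocess([text])[0]
    fs = vectorizer.features(doc)
    fv = np.asarray([[fs[f] for f in sorted(fs.keys())]])
    impolite_score, polite_score = clf.predict_proba(fv)[0]
    label = "polite" if polite_score > impolite_score else "impolite"
    return label, impolite_score, polite_score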