def stylistic_features():
    """Build one stylistic feature vector and label per conversation.

    Iterates over the module-level ``speaker_pairs`` mapping. For every
    conversation the vector is laid out as::

        LIWC counts of x's replies + LIWC counts of y's replies
        + [x tokens / len(conversation), y tokens / len(conversation)]

    The label is taken from the first utterance pair of the conversation:
    ``high`` when only the first speaker has ``is_justice`` set, ``low``
    when only the second one has. Any other combination yields ``error``
    and the conversation is left out of the result.

    Returns:
        A ``(data, data_target)`` tuple of parallel lists: the feature
        vectors and their labels.
    """
    feature_rows = []
    labels = []

    for _pair, conversation in speaker_pairs.iteritems():
        # presumably the concatenated reply text of each side -- confirm
        # against get_replies
        x_text = get_replies(conversation, "x")
        y_text = get_replies(conversation, "y")

        # Needs true division; the original note says a __future__ import
        # higher up in the file provides it.
        n_entries_x = len(conversation)
        x_avg_len = len(utils.tokenize_utterance(x_text)) / n_entries_x
        n_entries_y = len(conversation)
        y_avg_len = len(utils.tokenize_utterance(y_text)) / n_entries_y

        x_liwc = utils.get_liwc_counts_from_utterance(x_text)
        y_liwc = utils.get_liwc_counts_from_utterance(y_text)
        row = list(x_liwc) + list(y_liwc) + [x_avg_len, y_avg_len]

        # The first utterance pair decides who holds the justice role.
        first_x = all_utterances[conversation[0][0]]
        first_y = all_utterances[conversation[0][1]]
        x_is_justice = first_x["is_justice"]
        y_is_justice = first_y["is_justice"]

        if x_is_justice and not y_is_justice:
            label = high
        elif y_is_justice and not x_is_justice:
            label = low
        else:
            label = error

        # Conversations that could not be labelled are dropped.
        if label != error:
            feature_rows.append(row)
            labels.append(label)

    return (feature_rows, labels)
def count_exhibits_feature(utterance_pair, speaker):
    """Flag which of the first eight LIWC markers one utterance exhibits.

    Args:
        utterance_pair: Indexable of keys into ``all_utterances``.
        speaker: Index into ``utterance_pair`` selecting the utterance.

    Returns:
        A list of eight floats; position ``i`` is ``1.0`` when the LIWC
        count at index ``i`` is greater than zero and ``0.0`` otherwise.
    """
    record = all_utterances[utterance_pair[speaker]]
    liwc_counts = utils.get_liwc_counts_from_utterance(record["utterance"])
    return [1.0 if liwc_counts[idx] > 0 else 0.0 for idx in range(8)]
def count_coordination(utterance_pair):
    """Flag the LIWC markers that both utterances of a pair exhibit.

    Args:
        utterance_pair: Indexable whose items 0 and 1 are keys into
            ``all_utterances``.

    Returns:
        ``None`` when the token counts of the two utterances differ by 20
        or more (the pair is thrown out). Otherwise a list of eight floats;
        position ``i`` is ``1.0`` when the LIWC count at index ``i`` is
        greater than zero in both utterances and ``0.0`` otherwise.
    """
    second_text = all_utterances[utterance_pair[1]]["utterance"]
    first_text = all_utterances[utterance_pair[0]]["utterance"]

    second_liwc = utils.get_liwc_counts_from_utterance(second_text)
    first_liwc = utils.get_liwc_counts_from_utterance(first_text)

    second_tokens = utils.tokenize_utterance(second_text)
    first_tokens = utils.tokenize_utterance(first_text)

    # Discard pairs whose utterance lengths are too far apart.
    length_gap = abs(len(second_tokens) - len(first_tokens))
    if length_gap >= 20:
        return None

    return [
        1.0 if first_liwc[idx] > 0 and second_liwc[idx] > 0 else 0.0
        for idx in range(8)
    ]
def get_liwc_features(combined_utterances):
    """Return the LIWC counts of ``combined_utterances`` as a list.

    Args:
        combined_utterances: Value handed unchanged to
            ``utils.get_liwc_counts_from_utterance``.

    Returns:
        A new list holding the items of the helper's result.
    """
    liwc_counts = utils.get_liwc_counts_from_utterance(combined_utterances)
    return list(liwc_counts)