def extract_utterances(traindirectoryPath, testdirectoryPath):
    """Load the training and test dialog corpora and return them as lists.

    Args:
        traindirectoryPath: directory containing the training dialogs.
        testdirectoryPath: directory containing the test dialogs.

    Returns:
        Tuple (train_dialogs, test_dialogs), each materialized as a list.
    """
    # h.get_data yields dialogs lazily; materialize both so callers can
    # iterate repeatedly and take len().
    return list(h.get_data(traindirectoryPath)), list(h.get_data(testdirectoryPath))
def main():
    """Train a CRF dialog-act tagger, evaluate it, and write predictions.

    Command-line arguments:
        sys.argv[1] -- training data directory
        sys.argv[2] -- test data directory
        sys.argv[3] -- output file for the predicted tags

    Side effects: saves the trained model to 'postagger.crfsuite', writes
    one predicted tag per line (blank line between conversations) to the
    output file, and prints the overall accuracy.
    """
    # Build one feature sequence and one label sequence per conversation.
    # (Removed dead locals count/feats/labels/ct from the original.)
    x_data = []
    y_data = []
    for convo in get_data(sys.argv[1]):
        myfeatures, mylabels = extract_features_advcd(convo)
        x_data.append(myfeatures)
        y_data.append(mylabels)

    trainer = pycrfsuite.Trainer(verbose=False)
    for xseq, yseq in zip(x_data, y_data):
        trainer.append(xseq, yseq)
    trainer.set_params({'c1': 1.0,   # coefficient for L1 penalty
                        'c2': 1e-3,  # coefficient for L2 penalty
                        'max_iterations': 50,  # stop earlier
                        # include transitions that are possible, but not observed
                        'feature.possible_transitions': True})
    trainer.train('postagger.crfsuite')

    # Featurize the test set the same way.
    xtest = []
    ytest = []
    for convo in get_data(sys.argv[2]):
        tfeats, tlabels = extract_features_advcd(convo)
        xtest.append(tfeats)
        ytest.append(tlabels)

    tagger = pycrfsuite.Tagger()
    tagger.open('postagger.crfsuite')
    count_true = 0
    count_false = 0
    # FIX: the output file was previously opened and never closed; a context
    # manager guarantees it is flushed and closed even if tagging raises.
    with open(sys.argv[3], 'w') as f:
        for feats, corr in zip(xtest, ytest):
            pred = tagger.tag(feats)
            for predicted_tag, gold_tag in zip(pred, corr):
                if predicted_tag == gold_tag:
                    count_true += 1
                else:
                    count_false += 1
                f.write(predicted_tag + "\n")
            # Blank line separates conversations in the output.
            f.write("\n")

    total = count_true + count_false
    acc = count_true / total
    print("Accuracy of advanced:", acc)
def evaluate_model(self):
    """Tag every test dialogue with the trained CRF model, print accuracy,
    and pass the predicted label sequences to self.write_labels.

    Accuracy is computed only over utterances whose reference label is
    truthy (empty/missing act tags are excluded from the score), while
    predictions are still emitted for every utterance.
    """
    tagger = pycrfsuite.Tagger()
    tagger.open("dialogue_tagger.crtsuite")

    dialogues = list(tool.get_data(self.test_dir))
    gold_sequences = [self.extract_labels_from_dialogue(d) for d in dialogues]

    correct = 0
    scored = 0
    predictions = []
    for dialogue, gold in zip(dialogues, gold_sequences):
        predicted = tagger.tag(self.generate_features_from_dialogue(dialogue))
        predictions.append(predicted)
        for guess, reference in zip(predicted, gold):
            if not reference:
                continue  # unlabeled utterance: excluded from the score
            scored += 1
            if guess == reference:
                correct += 1

    accuracy = correct / scored
    print("Accuracy = " + str(accuracy))
    self.write_labels(predictions)
def data_to_features(data_dir):
    """Read every dialogue under data_dir and featurize it.

    Returns:
        (feature_set, label_set): parallel lists holding one feature
        sequence and one label sequence per dialogue.
    """
    dialogues = list(get_data(data_dir))
    feature_set = [dialogue_to_features(d) for d in dialogues]
    label_set = [dialogue_to_labels(d) for d in dialogues]
    return feature_set, label_set
def extract_features_and_labels(input_folder):
    """Build per-dialogue CRF feature and label sequences from a corpus.

    Walks every dialogue yielded by read_tool.get_data and featurizes each
    utterance with generate_feature.  A deque holding the advanced features
    of up to (n-1) previous utterances (n = 4) is threaded through so each
    utterance can condition on its recent context.  Missing act tags become
    the sentinel label "DEFAULT_TAG".

    Returns:
        (all_features, all_labels): parallel lists, one entry per dialogue;
        each entry is itself a list with one element per utterance.
    """
    ngram_value = 4
    all_features = []
    all_labels = []

    for dialogue in read_tool.get_data(input_folder):
        per_utt_features = []
        per_utt_labels = []
        prev_speaker = None
        # maxlen makes the deque silently drop the oldest context entry.
        context = deque(maxlen=ngram_value - 1)

        for position, utterance in enumerate(dialogue):
            tag = utterance.act_tag or "DEFAULT_TAG"
            speaker = utterance.speaker
            speaker_changed = speaker != prev_speaker

            feature, advanced_feature = generate_feature(
                utterance, speaker_changed, position == 0, context)

            prev_speaker = speaker
            # Newest context goes on the left, as in the original code.
            context.appendleft(advanced_feature)

            per_utt_features.append(feature)
            per_utt_labels.append(tag)

        all_features.append(per_utt_features)
        all_labels.append(per_utt_labels)

    return all_features, all_labels
def test(test_dir, output_file):
    """Tag every dialog under test_dir with the saved CRF model 'model'
    and write the predicted labels to output_file: one tag per line, with
    a blank line separating dialogs."""
    tagger = pycrfsuite.Tagger()
    tagger.open("model")
    with open(output_file, "w") as sink:
        for dialog in tool.get_data(test_dir):
            for tag in tagger.tag(analyze_dialog(dialog)):
                sink.write(f"{tag}\n")
            sink.write("\n")
def train(input_dir):
    """Train a CRF tagger on the dialogs under input_dir and save it
    to the file 'model'."""
    trainer = pycrfsuite.Trainer()
    for dialog in tool.get_data(input_dir):
        trainer.append(analyze_dialog(dialog), get_labels(dialog))
    trainer.set_params({
        "c1": 1.0,                             # L1 regularization weight
        "c2": 1e-3,                            # L2 regularization weight
        "max_iterations": 50,                  # cap training iterations
        "feature.possible_transitions": True,  # allow unobserved transitions
    })
    trainer.train("model")
def trainModel(self,data):
    """Train the advanced CRF model on every dialogue under `data` and
    save it to the file 'advanced_model'."""
    trainer = pycrfsuite.Trainer(verbose=False)
    for dialogue in hwutil.get_data(data):
        features = [self.word2features(dialogue, idx) for idx in range(len(dialogue))]
        labels = [utt.act_tag for utt in dialogue]
        trainer.append(features, labels)
    trainer.set_params({
        'c1': 1.0,    # coefficient for l1 penalty
        'c2': 1e-3,   # coefficient for L2 penalty
        'max_iterations': 50,  # stop earlier
        # include transitions that are possible, but not observed
        'feature.possible_transitions': True
    })
    trainer.train('advanced_model')
def extract_features_and_labels(input_folder):
    """Build per-dialogue feature and label sequences from a corpus.

    For each utterance, generate_feature is told whether the speaker just
    changed and whether it is the first utterance of its dialogue.  Missing
    act tags become the sentinel label "DEFAULT_TAG".

    Returns:
        (all_features, all_labels): parallel lists with one sequence per
        dialogue.
    """
    all_features = []
    all_labels = []
    for dialogue in read_tool.get_data(input_folder):
        per_utt_features = []
        per_utt_labels = []
        prev_speaker = None
        for position, utterance in enumerate(dialogue):
            tag = utterance.act_tag or "DEFAULT_TAG"
            speaker = utterance.speaker
            speaker_changed = speaker != prev_speaker
            per_utt_features.append(
                generate_feature(utterance, speaker_changed, position == 0))
            per_utt_labels.append(tag)
            prev_speaker = speaker
        all_features.append(per_utt_features)
        all_labels.append(per_utt_labels)
    return all_features, all_labels
def testModel(self,testdata, result):
    """Tag every dialogue in testdata with the saved 'advanced_model',
    write predictions to `result` (one tag per line, blank line between
    dialogues), and print overall accuracy over all predicted tags."""
    tagger = pycrfsuite.Tagger()
    tagger.open('advanced_model')
    correct = 0
    predicted_count = 0
    with open(result, 'w') as opt_file:
        for dialogue in hwutil.get_data(testdata):
            features = [self.word2features(dialogue, idx) for idx in range(len(dialogue))]
            predictions = tagger.tag(features)
            for tag in predictions:
                opt_file.write(tag)
                opt_file.write('\n')
            # Blank line marks the end of a dialogue in the output file.
            opt_file.write('\n')
            gold = [utterance.act_tag for utterance in dialogue]
            predicted_count += len(predictions)
            correct += sum(1 for p, g in zip(predictions, gold) if p == g)
    accuracy = correct / predicted_count
    print("accuracy = ", accuracy)
def train_model(self):
    """Fit a CRF dialogue-act tagger on the dialogues under self.input_dir
    and persist it to the file 'dialogue_tagger.crtsuite'."""
    trainer = pycrfsuite.Trainer(verbose=False)
    dialogues = list(tool.get_data(self.input_dir))
    label_sequences = [self.extract_labels_from_dialogue(d) for d in dialogues]
    for dialogue, labels in zip(dialogues, label_sequences):
        trainer.append(self.generate_features_from_dialogue(dialogue), labels)
    trainer.set_params({
        'c1': 1.0,   # coefficient for L1 penalty
        'c2': 1e-3,  # coefficient for L2 penalty
        'max_iterations': 50,  # stop earlier
        # include transitions that are possible, but not observed
        'feature.possible_transitions': True
    })
    # NOTE(review): extension 'crtsuite' looks like a typo for 'crfsuite',
    # but the evaluator opens the same name — kept for consistency.
    trainer.train("dialogue_tagger.crtsuite")
print(correct, wrong, (correct+wrong)) accuracy = correct * 100 / (correct+wrong) print("Accuracy: "+str(accuracy)) print(OUTPUT_FILE+" generated!") if __name__ == '__main__': start = time.time() # Check len of args for file names if len(sys.argv) >=3: TRAIN_DIRECTORY = sys.argv[1] TEST_DIRECTORY = sys.argv[2] OUTPUT_FILE = sys.argv[3] else: TRAIN_DIRECTORY = "prof_dataset/train" TEST_DIRECTORY = "prof_dataset/test" OUTPUT_FILE = "sarthak_baseline.txt" print("Loading Training Data from: "+TRAIN_DIRECTORY) train_conversation_list = list(tool.get_data(TRAIN_DIRECTORY)) train_features, train_labels = generate_features(train_conversation_list) train_crf(train_features, train_labels) print("Loading Test Data from: "+TEST_DIRECTORY) test_conversation_list = list(tool.get_data(TEST_DIRECTORY)) test_features, test_labels = generate_features(test_conversation_list) predict(test_features, test_labels) print("total_time = ",time.time() - start)
def generate_features(dir_path):
    """Featurize every dialog under dir_path.

    Returns:
        A 2-tuple (feature_sequences, label_sequences); element i of each
        list corresponds to the same dialog.
    """
    feature_sequences = []
    label_sequences = []
    for dialog in utils.get_data(dir_path):
        feature_sequences.append(get_feature(dialog))
        label_sequences.append([utterance.act_tag for utterance in dialog])
    return (feature_sequences, label_sequences)
import hw2_corpus_tool as hw2 import pycrfsuite import os import sys THIS_FOLDER = os.path.dirname(os.path.abspath(__file__)) train_file = os.path.join(THIS_FOLDER, sys.argv[1]) dev_file = os.path.join(THIS_FOLDER, sys.argv[2]) train_files = list(hw2.get_data(train_file)) dev_files = list(hw2.get_data(dev_file)) def word2features(last_utterance, utterance, curr_utter): all_features = [] if curr_utter == 0: last_speaker = curr_speaker = utterance[1] else: last_speaker = last_utterance[1] curr_speaker = utterance[1] adjective_pos = ['JJ', 'JJR', 'JJS'] x = 0 if not utterance[2]: all_features = ['NO_WORDS', 'non_verbal=%s' % utterance[3] ] else: while x < len(utterance[2]): if x == 0 and (x != len(utterance[2]) - 1): # x is the first utterance
def calculate_accuracy(correct, total):
    """Return percentage accuracy (correct / total * 100).

    Raises ZeroDivisionError if total is 0.
    """
    accuracy = (correct / total) * 100
    return accuracy


if __name__ == "__main__":
    # Expected CLI: <train dir> <dev dir> <output file>.
    if (len(sys.argv) < 4):
        print("Invalid input")
        sys.exit(1)
    trainDir = os.path.abspath(sys.argv[1])
    devDir = os.path.abspath(sys.argv[2])
    outputFile = sys.argv[3]
    featureList = []
    labelList = []
    # Collect features/labels for the whole training corpus.
    allData = hw2_corpus_tool.get_data(trainDir)
    for data in allData:
        features, labels = parse(data, True)
        featureList.extend(features)
        labelList.extend(labels)
    trainer = pycrfsuite.Trainer(verbose=False)
    # NOTE(review): append is called once with the fully extended lists, so
    # the whole corpus is handed to the trainer in a single call — whether
    # that yields one flat sequence or many depends on what parse() returns;
    # verify against parse's contract.
    trainer.append(featureList, labelList)
    trainer.set_params({
        'c1': 1.0,    # coefficient for L1 penalty
        'c2': 1e-3,   # coefficient for L2 penalty
        'max_iterations': 50,  # stop earlier
        # include transitions that are possible, but not observed
        'feature.possible_transitions': True
    })
    trainer.train('model')
    tagger = pycrfsuite.Tagger()
    tagger.open('model')
def get_all_data(directory_name):
    """Materialize every dialog yielded by hct.get_data into a list."""
    dialogs = hct.get_data(directory_name)
    return list(dialogs)
pred_correct += 1 if total > 0: print("ADVANCED ACCURACY {}".format(pred_correct / total)) f.close() return tagger def parse_pos(pos): pos = re.sub("[\^]", " ", pos) pos = pos.strip() pos = re.sub("\\s+", " ", pos) return pos if __name__ == '__main__': input_path = sys.argv[1] test_dir = sys.argv[2] output_file = sys.argv[3] data = get_data(input_path) X_features = [] Y_features = [] for dialog in data: x, y = create_features(dialog) X_features.append(x) if y: Y_features.append(y) trainer = create_trainer(X_features, Y_features) trainer.train('advanced.crfsuite') test = get_data(test_dir) tagger = evaulate_tagger(test, output_file)
def read(path):
    """Load the corpus at `path` via tool.get_data and return it unchanged."""
    return tool.get_data(path)
predictor = pycrfsuite.Tagger(verbose=False) predictor.open("advanced_dialog_act_tagger.crfsuite") output_file = open(OUTPUTFILE, "w+") correct_predictions = 0 total_predictions = 0 for conversation in range(len(test_features)): for label_index, predicted_label in enumerate( predictor.tag(test_features[conversation])): if predicted_label == test_labels[conversation][label_index]: correct_predictions += 1 total_predictions += 1 predicted_label += "\n" output_file.writelines(predicted_label) output_file.writelines("\n") output_file.close() print("Accuracy is ", (correct_predictions / total_predictions)) if __name__ == "__main__": start = time.time() training_set = list(hw2_corpus_tool.get_data(INPUTDIR)) dev_set = list(hw2_corpus_tool.get_data(TESTDIR)) train_features, train_labels = AdvancedTagger.generate_features_and_labels( training_set, 3) test_features, test_labels = AdvancedTagger.generate_features_and_labels( dev_set, 3) print("Training model") AdvancedTagger.train_model(train_features, train_labels) AdvancedTagger.predict(test_features, test_labels) print("Time taken (in seconds) :", (time.time() - start))
# In[1]: import hw2_corpus_tool from hw2_corpus_tool import get_data import sys # In[2]: data_directory = sys.argv[1] test_directory = sys.argv[2] output_file = sys.argv[3] # In[3]: data = list(get_data(data_directory)) test = list(get_data(test_directory)) # In[4]: feature_set = [] tag_set = [] for d in data: feature_d = [] tag_d = [] current_speaker = d[0].speaker for du_i in range(len(d)): du = d[du_i] feature_du = [] if du_i == 0: feature_du.append('first_utterance')