def extract_utterances(traindirectoryPath, testdirectoryPath):
    dialogs_train = h.get_data(traindirectoryPath)
    list_dialogs_train = list(dialogs_train)

    dialogs_test = h.get_data(testdirectoryPath)
    list_dialogs_test = list(dialogs_test)
    return list_dialogs_train, list_dialogs_test


def main():

    convos = get_data(sys.argv[1])
    x_data = []
    y_data = []
    for c in convos:
        myfeatures, mylabels = extract_features_advcd(c)
        x_data.append(myfeatures)
        y_data.append(mylabels)

    trainer = pycrfsuite.Trainer(verbose=False)
    for xseq, yseq in zip(x_data, y_data):
        trainer.append(xseq, yseq)


    trainer.set_params({
        'c1': 1.0,             # coefficient for L1 penalty
        'c2': 1e-3,            # coefficient for L2 penalty
        'max_iterations': 50,  # stop earlier
        # include transitions that are possible, but not observed
        'feature.possible_transitions': True
    })
    trainer.train('postagger.crfsuite')
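    # A hedged aside (not part of the original script): python-crfsuite exposes
    # the learned weights through Tagger.info(), which can be used to
    # sanity-check the 'feature.possible_transitions' setting above by listing
    # the strongest label-to-label transitions the model picked up.
    inspector = pycrfsuite.Tagger()
    inspector.open('postagger.crfsuite')
    top_transitions = sorted(inspector.info().transitions.items(),
                             key=lambda kv: -kv[1])[:10]
    for (label_from, label_to), weight in top_transitions:
        print(label_from, '->', label_to, round(weight, 3))
    inspector.close()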


    xtest = []
    ytest = []
    testconvos = get_data(sys.argv[2])
    for t in testconvos:
        tfeats, tlabels = extract_features_advcd(t)
        xtest.append(tfeats)
        ytest.append(tlabels)


    f = open(sys.argv[3], 'w')  # output file path taken from the command line
    tagger = pycrfsuite.Tagger()
    tagger.open('postagger.crfsuite')
    count_true = 0
    count_false = 0
    for i in range(len(xtest)):
        pred = tagger.tag(xtest[i])
        corr = ytest[i]

        for j in range(len(pred)):
            if pred[j] == corr[j]:
                count_true += 1
            else:
                count_false += 1
            f.write(pred[j] + "\n")

        f.write("\n")
    f.close()

    total = count_true + count_false
    acc = count_true / total
    print("Accuracy of advanced:", acc)
    def evaluate_model(self):
        tagger = pycrfsuite.Tagger()
        tagger.open("dialogue_tagger.crtsuite")

        X_test = list(tool.get_data(self.test_dir))
        y_test = [
            self.extract_labels_from_dialogue(dialogue) for dialogue in X_test
        ]

        no_of_correct_predictions = 0
        total = 0
        predictions = []

        for xseq, yseq in zip(X_test, y_test):
            predicted = tagger.tag(self.generate_features_from_dialogue(xseq))
            predictions.append(predicted)

            for pred, act in zip(predicted, yseq):
                if act:
                    no_of_correct_predictions += (1 if pred == act else 0)
                    total += 1

        accuracy = no_of_correct_predictions / total
        print("Accuracy = " + str(accuracy))
        self.write_labels(predictions)
def data_to_features(data_dir):

    data = list(get_data(data_dir))
    feature_set = [dialogue_to_features(dialogue) for dialogue in data]
    label_set = [dialogue_to_labels(dialogue) for dialogue in data]
    # print(feature_set)
    # print(label_set)
    return feature_set, label_set
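# A hedged sketch of per-dialogue helpers in the spirit of dialogue_to_features /
# dialogue_to_labels used above. The utterance attributes (act_tag, speaker)
# match the ones used elsewhere in these examples; the token-level features
# below are illustrative assumptions, not the original feature design.
def dialogue_to_labels_sketch(dialogue):
    # One act tag per utterance; fall back to a placeholder for missing tags.
    return [utt.act_tag if utt.act_tag else "UNK" for utt in dialogue]


def dialogue_to_features_sketch(dialogue):
    features = []
    previous_speaker = None
    for index, utt in enumerate(dialogue):
        feats = []
        if index == 0:
            feats.append("FIRST_UTTERANCE")
        if previous_speaker is not None and utt.speaker != previous_speaker:
            feats.append("SPEAKER_CHANGE")
        # Token-level features: assumes utt.pos is a list of (token, pos)
        # pairs, as in hw2_corpus_tool; adjust to the real structure.
        for token, pos in (utt.pos or []):
            feats.append("TOKEN_" + token.lower())
            feats.append("POS_" + pos)
        features.append(feats)
        previous_speaker = utt.speaker
    return features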
def extract_features_and_labels(input_folder):

    ## Fetch the entire dataset (dialogue_set) using read_tool
    dialogue_set = read_tool.get_data(input_folder)

    ## initialize overall feature set for the training dataset
    dialogue_set_features = []
    dialogue_set_labels = []

    ## Set n value for ngram model
    ngram_value = 4

    ## For each conversation (dialogue) in the training dataset
    for dialogue in dialogue_set:

        ## Initialize features for each dialogue
        dialogue_features = []
        dialogue_labels = []
        previous_speaker = None
        current_speaker = None

        ## Initialise a deque to remember features for (n-1) previous utterances
        ngram_queue = deque(maxlen=(ngram_value - 1))

        ## For each utterance in a dialogue
        for index, utterance in enumerate(dialogue):

            ## Fetch tag, current speaker
            act_tag = utterance.act_tag if utterance.act_tag else "DEFAULT_TAG"
            current_speaker = utterance.speaker

            ## Check whether the speaker has changed since the previous utterance
            isSpeakerChange = current_speaker != previous_speaker

            ## Evaluate features for the utterance
            feature, advanced_feature = generate_feature(
                utterance, isSpeakerChange, index == 0, ngram_queue)

            ## Assign current speaker as the previous one before moving to next utterance
            previous_speaker = current_speaker

            ## Push the advanced features onto the deque so later utterances can use them as context
            ngram_queue.appendleft(advanced_feature)

            ## Append utterance features, labels to the dialog
            dialogue_features += [feature]
            dialogue_labels += [act_tag]

        ## Merge features, labels for the entire dataset
        dialogue_set_features += [dialogue_features]
        dialogue_set_labels += [dialogue_labels]

    return dialogue_set_features, dialogue_set_labels
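# A hedged sketch of what a generate_feature helper compatible with the loop
# above might look like. The signature and return shape (per-utterance feature
# list plus the "advanced" features pushed onto the deque) follow the call
# site; the actual feature choices are assumptions for illustration only.
def generate_feature_sketch(utterance, is_speaker_change, is_first, ngram_queue):
    # Basic features for the current utterance (assumes utterance.pos is a
    # list of (token, pos) pairs, as in hw2_corpus_tool).
    current = []
    for token, pos in (utterance.pos or []):
        current.append("TOKEN_" + token.lower())
        current.append("POS_" + pos)

    feature = list(current)
    if is_first:
        feature.append("FIRST_UTTERANCE")
    if is_speaker_change:
        feature.append("SPEAKER_CHANGE")

    # Context features: copy in the remembered features of up to (n-1)
    # previous utterances, prefixed with their distance from the current one.
    for distance, previous in enumerate(ngram_queue, start=1):
        feature.extend("PREV%d_%s" % (distance, f) for f in previous)

    # The "advanced" features are what later utterances will see as context.
    return feature, current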
Example n. 6
def test(test_dir, output_file):
    tagger = pycrfsuite.Tagger()
    tagger.open("model")
    data = tool.get_data(test_dir)
    with open(output_file, "w") as file:
        for i in data:
            features = analyze_dialog(i)
            predict_labels = tagger.tag(features)
            for tag in predict_labels:
                file.write(f"{tag}\n")
            file.write("\n")
Example n. 7
def train(input_dir):
    data = tool.get_data(input_dir)
    trainer = pycrfsuite.Trainer()
    for i in data:
        features = analyze_dialog(i)
        labels = get_labels(i)
        trainer.append(features, labels)
    trainer.set_params({
        "c1": 1.0,
        "c2": 1e-3,
        "max_iterations": 50,
        "feature.possible_transitions": True
    })
    trainer.train("model")
    def trainModel(self, data):
        trainer = pycrfsuite.Trainer(verbose=False)
        for dialogues in hwutil.get_data(data):
            x_train = [self.word2features(dialogues, i) for i in range(len(dialogues))]
            y_train = [utterance.act_tag for utterance in dialogues]
            trainer.append(x_train, y_train)

        trainer.set_params({
            'c1': 1.0,  # coefficient for L1 penalty
            'c2': 1e-3,  # coefficient for L2 penalty
            'max_iterations': 50,  # stop earlier

            # include transitions that are possible, but not observed
            'feature.possible_transitions': True
        })

        trainer.train('advanced_model')
Example n. 9
def extract_features_and_labels(input_folder):

    ## Fetch the entire dataset (dialogue_set) using read_tool
    dialogue_set = read_tool.get_data(input_folder)

    ## initialize overall feature set for the training dataset
    dialogue_set_features = []
    dialogue_set_labels = []

    ## For each conversation (dialogue) in the training dataset
    for dialogue in dialogue_set:

        ## Initialize features for each dialogue
        dialogue_features = []
        dialogue_labels = []
        previous_speaker = None
        current_speaker = None

        ## For each utterance in a dialogue
        for index, utterance in enumerate(dialogue):

            ## Fetch tag, current speaker
            act_tag = utterance.act_tag if utterance.act_tag else "DEFAULT_TAG"
            current_speaker = utterance.speaker

            ## Check whether the speaker has changed since the previous utterance
            isSpeakerChange = current_speaker != previous_speaker

            ## Evaluate features for the utterance
            feature = generate_feature(utterance, isSpeakerChange, index == 0)

            ## Assign current speaker as the previous one before moving to next utterance
            previous_speaker = current_speaker

            ## Append utterance features, labels to the dialog
            dialogue_features += [feature]
            dialogue_labels += [act_tag]

        ## Merge features, labels for the entire dataset
        dialogue_set_features += [dialogue_features]
        dialogue_set_labels += [dialogue_labels]

    return dialogue_set_features, dialogue_set_labels
    def testModel(self, testdata, result):
        tagger = pycrfsuite.Tagger()
        tagger.open('advanced_model')
        with open(result, 'w') as opt_file:
            correct = 0
            total = 0
            for dialogue in hwutil.get_data(testdata):
                res = tagger.tag([self.word2features(dialogue, i) for i in range(len(dialogue))])
                for x in res:
                    opt_file.write(x)
                    opt_file.write('\n')
                opt_file.write('\n')
                y_true = [utterance.act_tag for utterance in dialogue]
                total += len(res)
                for y_pred, y in zip(res, y_true):
                    if y_pred == y:
                        correct += 1
            accuracy = correct / total
            print("accuracy = ", accuracy)
    def train_model(self):
        trainer = pycrfsuite.Trainer(verbose=False)
        X_train = list(tool.get_data(self.input_dir))
        y_train = [
            self.extract_labels_from_dialogue(dialogue) for dialogue in X_train
        ]

        for xseq, yseq in zip(X_train, y_train):
            trainer.append(self.generate_features_from_dialogue(xseq), yseq)

        trainer.set_params({
            'c1': 1.0,  # coefficient for L1 penalty
            'c2': 1e-3,  # coefficient for L2 penalty
            'max_iterations': 50,  # stop earlier

            # include transitions that are possible, but not observed
            'feature.possible_transitions': True
        })

        trainer.train("dialogue_tagger.crtsuite")
Example n. 12
    print(correct, wrong, (correct+wrong))
    accuracy = correct * 100 / (correct+wrong)
    print("Accuracy: "+str(accuracy))
    print(OUTPUT_FILE+" generated!")


    

if __name__ == '__main__':
    start = time.time()
    # Check number of command-line args for the directory and file names
    if len(sys.argv) >= 4:
        TRAIN_DIRECTORY = sys.argv[1]
        TEST_DIRECTORY = sys.argv[2]
        OUTPUT_FILE = sys.argv[3]
    else:
        TRAIN_DIRECTORY = "prof_dataset/train"
        TEST_DIRECTORY = "prof_dataset/test"
        OUTPUT_FILE = "sarthak_baseline.txt"    
    
    print("Loading Training Data from: "+TRAIN_DIRECTORY)
    train_conversation_list = list(tool.get_data(TRAIN_DIRECTORY))
    train_features, train_labels = generate_features(train_conversation_list)
    train_crf(train_features, train_labels)
    print("Loading Test Data from: "+TEST_DIRECTORY)
    test_conversation_list = list(tool.get_data(TEST_DIRECTORY))
    test_features, test_labels = generate_features(test_conversation_list)
    predict(test_features, test_labels)
    
    print("total_time = ",time.time() - start)
    
def generate_features(dir_path):
    features = ([], [])
    for dialog in utils.get_data(dir_path):
        features[0].append(get_feature(dialog))
        features[1].append([utterance.act_tag for utterance in dialog])
    return features
Example n. 14
import hw2_corpus_tool as hw2
import pycrfsuite
import os
import sys


THIS_FOLDER = os.path.dirname(os.path.abspath(__file__))

train_file = os.path.join(THIS_FOLDER, sys.argv[1])
dev_file = os.path.join(THIS_FOLDER, sys.argv[2])

train_files = list(hw2.get_data(train_file))
dev_files = list(hw2.get_data(dev_file))


def word2features(last_utterance, utterance, curr_utter):
    all_features = []
    if curr_utter == 0:
        last_speaker = curr_speaker = utterance[1]
    else:
        last_speaker = last_utterance[1]
        curr_speaker = utterance[1]
    adjective_pos = ['JJ', 'JJR', 'JJS']
    x = 0
    if not utterance[2]:
        all_features = ['NO_WORDS',
                        'non_verbal=%s' % utterance[3]
                        ]
    else:
        while x < len(utterance[2]):
            if x == 0 and (x != len(utterance[2]) - 1):  # first token, and not also the last
Example n. 15
def calculate_accuracy(correct, total):
    accuracy = (correct / total) * 100
    return accuracy


if __name__ == "__main__":
    if len(sys.argv) < 4:
        print("Invalid input")
        sys.exit(1)
    trainDir = os.path.abspath(sys.argv[1])
    devDir = os.path.abspath(sys.argv[2])
    outputFile = sys.argv[3]
    featureList = []
    labelList = []
    allData = hw2_corpus_tool.get_data(trainDir)
    for data in allData:
        features, labels = parse(data, True)
        featureList.extend(features)
        labelList.extend(labels)
    trainer = pycrfsuite.Trainer(verbose=False)
    trainer.append(featureList, labelList)
    trainer.set_params({
        'c1': 1.0,
        'c2': 1e-3,
        'max_iterations': 50,
        'feature.possible_transitions': True
    })
    trainer.train('model')
    tagger = pycrfsuite.Tagger()
    tagger.open('model')
Example n. 16
def get_all_data(directory_name):
    return list(hct.get_data(directory_name))
Example n. 17
                    pred_correct += 1
    if total > 0:
        print("ADVANCED ACCURACY {}".format(pred_correct / total))
    f.close()
    return tagger


def parse_pos(pos):
    pos = re.sub(r"[\^]", " ", pos)
    pos = pos.strip()
    pos = re.sub(r"\s+", " ", pos)
    return pos
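# A hedged usage sketch (not from the original script): the inputs below are
# made-up strings that just illustrate what parse_pos does, i.e. replace "^"
# with a space, trim the ends, and collapse runs of whitespace.
print(parse_pos("sd^e"))          # -> "sd e"
print(parse_pos("  NN   VB  "))   # -> "NN VB"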


if __name__ == '__main__':
    input_path = sys.argv[1]
    test_dir = sys.argv[2]
    output_file = sys.argv[3]
    data = get_data(input_path)
    X_features = []
    Y_features = []
    for dialog in data:
        x, y = create_features(dialog)
        X_features.append(x)
        if y:
            Y_features.append(y)
    trainer = create_trainer(X_features, Y_features)
    trainer.train('advanced.crfsuite')
    test = get_data(test_dir)
    tagger = evaulate_tagger(test, output_file)
def read(path):

    data = tool.get_data(path)
    return data
        predictor = pycrfsuite.Tagger(verbose=False)
        predictor.open("advanced_dialog_act_tagger.crfsuite")
        output_file = open(OUTPUTFILE, "w+")
        correct_predictions = 0
        total_predictions = 0
        for conversation in range(len(test_features)):
            for label_index, predicted_label in enumerate(
                    predictor.tag(test_features[conversation])):
                if predicted_label == test_labels[conversation][label_index]:
                    correct_predictions += 1
                total_predictions += 1
                predicted_label += "\n"
                output_file.writelines(predicted_label)
            output_file.writelines("\n")
        output_file.close()
        print("Accuracy is ", (correct_predictions / total_predictions))


if __name__ == "__main__":
    start = time.time()
    training_set = list(hw2_corpus_tool.get_data(INPUTDIR))
    dev_set = list(hw2_corpus_tool.get_data(TESTDIR))
    train_features, train_labels = AdvancedTagger.generate_features_and_labels(
        training_set, 3)
    test_features, test_labels = AdvancedTagger.generate_features_and_labels(
        dev_set, 3)
    print("Training model")
    AdvancedTagger.train_model(train_features, train_labels)
    AdvancedTagger.predict(test_features, test_labels)
    print("Time taken (in seconds) :", (time.time() - start))
Example n. 20
# In[1]:

import hw2_corpus_tool
from hw2_corpus_tool import get_data
import sys

# In[2]:

data_directory = sys.argv[1]
test_directory = sys.argv[2]
output_file = sys.argv[3]

# In[3]:

data = list(get_data(data_directory))
test = list(get_data(test_directory))

# In[4]:

feature_set = []
tag_set = []
for d in data:
    feature_d = []
    tag_d = []
    current_speaker = d[0].speaker
    for du_i in range(len(d)):
        du = d[du_i]
        feature_du = []
        if du_i == 0:
            feature_du.append('first_utterance')