Exemple #1
0
def extractFeaturesAndLabels(inputFolder):
    """Build per-dialogue feature and label sequences for a corpus folder.

    Each dialogue produced by ``taTool.get_data(inputFolder)`` is walked
    utterance by utterance. For every utterance, ``generateFeatures`` is
    called with the utterance, a flag telling whether the speaker is the
    same as for the previous utterance of that dialogue, and a flag telling
    whether it is the first utterance of the dialogue.

    Args:
        inputFolder: Passed unchanged to ``taTool.get_data``.

    Returns:
        A ``(features, labels)`` tuple of two lists with one entry per
        dialogue. Each entry is itself a list with one item per utterance:
        the ``generateFeatures`` result and the utterance's ``act_tag``
        respectively.
    """
    dialogCorpusFeature = []
    dialogCorpusLabel = []

    for dialogSet in taTool.get_data(inputFolder):
        dialogSetFeature = []
        dialogSetLabel = []
        # Reset per dialogue: nobody has spoken before the first utterance.
        previousSpeaker = None
        for index, dialog in enumerate(dialogSet):
            actTag = dialog.act_tag
            currentSpeaker = dialog.speaker

            feature = generateFeatures(
                dialog, currentSpeaker == previousSpeaker, index == 0)
            previousSpeaker = currentSpeaker

            # append() grows the lists in place; rebuilding them with
            # ``lst = lst + [item]`` copies the whole list on every step.
            dialogSetFeature.append(feature)
            dialogSetLabel.append(actTag)

        dialogCorpusFeature.append(dialogSetFeature)
        dialogCorpusLabel.append(dialogSetLabel)
    return (dialogCorpusFeature, dialogCorpusLabel)
Exemple #2
0
def test_data(args):
    """Tag the corpus located at ``args[1]`` with the 'advanced' model.

    Args:
        args: Indexable of arguments; ``args[1]`` is handed to
            ``tool.get_data``.

    Returns:
        A dict with two keys: ``'label'`` holds the result of
        ``tagger.tag`` on the ``'feature'`` entry of the extracted
        features, and ``'feature'`` holds the complete
        ``create_features`` result.
    """
    corpus = tool.get_data(args[1])

    # Load the model before extracting features, as the original did.
    crf_tagger = pycrfsuite.Tagger()
    crf_tagger.open('advanced')

    extracted = create_features(corpus)
    predicted = crf_tagger.tag(extracted['feature'])
    return {
        'label': predicted,
        'feature': extracted,
    }
Exemple #3
0
    def crftrainer(self, directry, data_model):
        """Collect training sequences from ``directry`` and train the model.

        Every ``(name, conversation)`` pair yielded by ``tool.get_data`` is
        converted with ``self.buildmodel`` and appended to
        ``self.crf_feature_train``; training is then started with
        ``data_model`` as its argument.

        Args:
            directry: Passed unchanged to ``tool.get_data``.
            data_model: Passed unchanged to ``self.crf_feature_train.train``.
        """
        # The first element of each pair is not needed for training.
        for _name, conversation in tool.get_data(directry):
            features, labels = self.buildmodel(conversation)
            self.crf_feature_train.append(features, labels)

        self.crf_feature_train.train(data_model)
 def tag_dir(self, test_dir):
     """Tag every dialogue found under ``test_dir``.

     Opens the model file 'sequence_label_model.crfsuite' on
     ``self.tagger``, then for each item from ``get_data(test_dir)``
     extends ``self.tag_data[item[0]]`` with the tags predicted for the
     features built from ``item[1]``.

     Args:
         test_dir: Passed unchanged to ``get_data``.
     """
     self.tagger.open('sequence_label_model.crfsuite')
     for entry in get_data(test_dir):
         # Only the features are needed here; the second value returned
         # by get_features_act_tags is ignored.
         features, _unused_tags = self.get_features_act_tags(entry[1])
         collected = self.tag_data[entry[0]]
         collected.extend(self.tagger.tag(features))
Exemple #5
0
def extractFeaturesAndLabels(inputFolder):
    """Extract feature and label sequences for every dialogue in a folder.

    The corpus is loaded with ``taTool.get_data(inputFolder)``. Within each
    dialogue, ``generateFeatures`` receives the utterance, whether its
    speaker equals the speaker of the preceding utterance in the same
    dialogue, and whether the utterance is the first of the dialogue.

    Args:
        inputFolder: Passed unchanged to ``taTool.get_data``.

    Returns:
        A ``(features, labels)`` tuple. Both members are lists with one
        inner list per dialogue; the inner lists hold, per utterance, the
        ``generateFeatures`` result and the ``act_tag`` value respectively.
    """
    dialogCorpusFeature = []
    dialogCorpusLabel = []

    for dialogSet in taTool.get_data(inputFolder):
        dialogSetFeature = []
        dialogSetLabel = []
        # Speaker tracking restarts with every dialogue.
        previousSpeaker = None
        for index, dialog in enumerate(dialogSet):
            actTag = dialog.act_tag
            currentSpeaker = dialog.speaker

            feature = generateFeatures(dialog,
                                       (currentSpeaker == previousSpeaker),
                                       index == 0)
            previousSpeaker = currentSpeaker

            # In-place append avoids re-copying the list for each utterance,
            # which ``lst = lst + [item]`` did.
            dialogSetFeature.append(feature)
            dialogSetLabel.append(actTag)

        dialogCorpusFeature.append(dialogSetFeature)
        dialogCorpusLabel.append(dialogSetLabel)
    return (dialogCorpusFeature, dialogCorpusLabel)
Exemple #6
0
def test_data(args):
    """Tag the corpus located at ``args[1]`` with the 'baselinecrf' model.

    The number of predicted tags is printed before returning.

    Args:
        args: Indexable of arguments; ``args[1]`` is handed to
            ``tool.get_data``.

    Returns:
        A dict with two keys: ``'label'`` holds the result of
        ``tagger.tag`` on the ``'feature'`` entry of the extracted
        features, and ``'feature'`` holds the complete
        ``create_features`` result.
    """
    corpus = tool.get_data(args[1])

    # Load the model before extracting features, as the original did.
    crf_tagger = pycrfsuite.Tagger()
    crf_tagger.open('baselinecrf')

    extracted = create_features(corpus)
    predicted = crf_tagger.tag(extracted['feature'])
    print(len(predicted))
    return {
        'label': predicted,
        'feature': extracted,
    }
Exemple #7
0
def read_data(args):
    """Load the corpus at ``args[0]`` and repackage its extracted features.

    Args:
        args: Indexable of arguments; ``args[0]`` is handed to
            ``tool.get_data``.

    Returns:
        A dict mapping ``'xtrain'``, ``'ytrain'``, ``'file'`` and
        ``'length'`` to the ``'feature'``, ``'label'``, ``'file'`` and
        ``'length'`` entries of the ``create_features`` result.
    """
    extracted = create_features(tool.get_data(args[0]))

    # (output key, key in the create_features result), in output order.
    key_pairs = (
        ('xtrain', 'feature'),
        ('ytrain', 'label'),
        ('file', 'file'),
        ('length', 'length'),
    )
    return {out_key: extracted[src_key] for out_key, src_key in key_pairs}
Exemple #8
0
from pprint import pprint

__author__ = "Shurui Liu"
__email__ = "*****@*****.**"

# Record the start time so the elapsed run time can be computed later.
start = timeit.default_timer()

# Command-line arguments, in order: training directory, test directory,
# output file. Example invocation:
# python3 baseline_crf.py 'testdata/inputdir' 'testdata/testdir' 'baseline_output.txt'
inputdir = sys.argv[1]
testdir = sys.argv[2]
outputfile = sys.argv[3]

# Load both corpora. get_data presumably returns a generator (per the
# original author's note) - TODO confirm against hw3_corpus_tool.
train_file = hw3_corpus_tool.get_data(inputdir)
test_file = hw3_corpus_tool.get_data(testdir)

# Materialize both results as lists so they can be indexed and measured.
train_list = list(train_file)
test_list = list(test_file)

# Accumulators for the training features (x_train) and labels (y_train).
x_train = []
y_train = []
# Walk every item of the training list by index.
# NOTE(review): range(len(file) - 1) never visits the last index of each
# item - confirm that skipping it is intended.
for file in train_list:
    for line in range(len(file) - 1):
        line_feature = []
        # Element 0 of the row is taken as the act tag - assumes each row is
        # an indexable record with the tag first; TODO confirm.
        # NOTE(review): line_feature and act_tag are not used further in the
        # visible part of this loop.
        act_tag = file[line][0]
 def scan_input_dir(self, input_dir):
     """Append one training sequence per dialogue found in ``input_dir``.

     For each item from ``get_data(input_dir)``, the features and act
     tags built from ``item[1]`` by ``self.get_features_act_tags`` are
     appended to ``self.trainer``.

     Args:
         input_dir: Passed unchanged to ``get_data``.
     """
     for entry in get_data(input_dir):
         sequence_features, sequence_tags = self.get_features_act_tags(entry[1])
         self.trainer.append(sequence_features, sequence_tags)
Exemple #10
0
 def crfpred(self, datafolder, output):
     """Load the data in ``datafolder`` and pass it to ``self.writetofile``.

     Args:
         datafolder: Passed unchanged to ``tool.get_data``.
         output: Passed unchanged as the second argument of
             ``self.writetofile``.
     """
     self.writetofile(tool.get_data(datafolder), output)
Exemple #11
0
# Tags read back from the tagger's output text file, in file order.
output_tag = []

# Collect every non-blank line of the output file, skipping the lines that
# start with "Filename".
with open(textfile) as f:
    for line in f:
        line = line.strip()
        if not line:  # nothing left after stripping whitespace
            continue
        if line.startswith("Filename"):  # header line, not a tag
            continue
        output_tag.append(line)
print(len(output_tag))

# Load the development corpus and materialize it as a list. get_data
# presumably returns a generator (per the original author's note) - TODO
# confirm against hw3_corpus_tool.
dev_file = hw3_corpus_tool.get_data(devdir)
dev_list = list(dev_file)

# Reference tags taken from the development corpus.
dev_tag = []

# NOTE(review): range(len(file) - 1) never visits the last index of each
# item - confirm that skipping it is intended.
for file in dev_list:
    for line in range(len(file) - 1):
        # Element 0 of the row is taken as the act tag - assumes each row is
        # an indexable record with the tag first; TODO confirm.
        act_tag = file[line][0]
        dev_tag.append(act_tag)

print(len(dev_tag))

# Counters for the comparison: number of reference tags, and number of
# correct tags (starts at zero).
total = len(dev_tag)
correct = 0
Exemple #12
0
from hw3_corpus_tool import get_data
import sys


def check_structure(arr):
    """Recursively print the leaf values of a nested container.

    Each element of ``arr`` is examined in order:

    * an element without a length (``len()`` raises ``TypeError``) is a
      leaf: the line "no furthur decomposition" is printed, followed by
      the element itself;
    * a non-empty ``str`` or ``bytes`` element is also reported as a leaf,
      because indexing a string yields strings again and descending into
      it would never terminate;
    * any other non-empty element is descended into recursively;
    * empty elements (length 0) are skipped silently.

    Args:
        arr: An iterable whose elements may themselves be nested
            containers.

    Returns:
        None. The result is written to standard output.
    """
    for item in arr:
        try:
            size = len(item)
        except TypeError:
            # No length at all: cannot be decomposed any further.
            size = None

        is_leaf = size is None or (size and isinstance(item, (str, bytes)))
        if is_leaf:
            # Single-argument print(...) works on both Python 2 and 3.
            print("no furthur decomposition")
            print(item)
        elif size:
            check_structure(item)


# Debug entry point: print every item that get_data produces for the
# directory given as the first command-line argument.
path = sys.argv[1]
# Single-argument print(...) calls give the same output on Python 2 and 3;
# the original print statements are a syntax error on Python 3.
print('path %s' % path)
doc = get_data(path)

for i in doc:
    print(i)