Example #3
    def __init__(self):
        self.name = ""
        self.total_length = 0
        self.vocabulary = Vocabulary()
        self.train_dataset = []
        self.test_dataset = []
        self.valid_dataset = []
        self.label_set_size = 0
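Every example on this page instantiates a Vocabulary and later calls add_and_get_indices, truncate, and get_character_vocab on it, but the class itself is not shown. The following is a minimal sketch of what such a class might look like, assuming a frequency-based cap in truncate; all details here are guesses, not the real implementation.

class Vocabulary:
    def __init__(self, unk_token="<UNK>", max_size=10000):
        ## index 0 is reserved for the unknown token
        self.token_to_index = {unk_token: 0}
        self.counts = {}
        self.unk_token = unk_token
        self.max_size = max_size  # assumed cap applied by truncate()

    def add_and_get_indices(self, tokens):
        ## register unseen tokens and return the indices of all of them
        indices = []
        for token in tokens:
            self.counts[token] = self.counts.get(token, 0) + 1
            if token not in self.token_to_index:
                self.token_to_index[token] = len(self.token_to_index)
            indices.append(self.token_to_index[token])
        return indices

    def truncate(self):
        ## keep only the most frequent tokens, re-indexing from scratch
        kept = sorted(self.counts, key=self.counts.get,
                      reverse=True)[:self.max_size]
        self.token_to_index = {self.unk_token: 0}
        for token in kept:
            self.token_to_index[token] = len(self.token_to_index)

    def get_character_vocab(self):
        ## derive a character-level vocabulary from the word vocabulary
        chars = sorted({c for token in self.token_to_index for c in token})
        self.char_to_index = {c: i for i, c in enumerate(chars)}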
Example #5
class MeetingRecoder(AbstractDataset):
    class Utterance:
        def __init__(self, utterance):
            self.id = utterance.utterance_id
            ## mapping between DAMSL and the tagset used in SWDA
            # index for DAMSL starts from 1, so shift to 0-based
            self.da_tag = DAMSL_TAGSET[utterance.da_tag.strip()] - 1
            self.speaker = utterance.speaker
            self.tokens = utterance.original_text
            self.length = len(self.tokens)

    class Dialogue:
        def __init__(self, transcript):
            self.conversation_no = transcript.conversation_id
            self.conversation_length = len(transcript.utterances)
            self.utterances = []
            for utterance in transcript.utterances:
                ## only keep utterances whose MRDA tag maps onto a DAMSL tag
                mapped_tag = MRDA_DAMSL_MAP.get(utterance.da_tag)
                if mapped_tag is not None and mapped_tag != "##" and mapped_tag in DAMSL_TAGSET:
                    self.utterances.append(MeetingRecoder.Utterance(utterance))


    def __init__(self, args, dataset_path):
        self.name = type(self).__name__
        corpus = mrda.CorpusReader(dataset_path)
        # standard train/test splits
        self.total_length = 0
        self.vocabulary = Vocabulary()
        self.label_set_size = len(DAMSL_TAGSET)

        dataset = []
        for transcript in corpus.iter_transcripts(display_progress=True):
            self.total_length += 1
            if args.truncate_dataset and self.total_length > 20:
                break
            dataset.append(MeetingRecoder.Dialogue(transcript))

        # TODO: use the exact test/dev split for MRDA; ideally, do cross-validation instead
        if args.truncate_dataset:
            self.train_dataset = dataset[:10]
            self.valid_dataset = dataset[10:15]
            self.test_dataset = dataset[15:20]
        else:
            self.train_dataset = dataset[:45]
            self.valid_dataset = dataset[45:60]
            self.test_dataset = dataset[60:]

        ## create vocabulary from training data (unseen tokens become UNKs at test time)
        for data_point in self.train_dataset:
            for utterance in data_point.utterances:
                self.vocabulary.add_and_get_indices(utterance.tokens)

        if args.limit_vocabulary:
            self.vocabulary.truncate()

        ## create character vocabulary
        self.vocabulary.get_character_vocab()
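A sketch of how such a dataset class might be instantiated. The actual argument parser and corpus path are not part of these examples; the flag names below simply mirror the attributes accessed above, and the path is hypothetical.

from argparse import Namespace

args = Namespace(truncate_dataset=True, limit_vocabulary=False)
mrda_data = MeetingRecoder(args, "data/mrda/")  # hypothetical path
print(mrda_data.label_set_size, len(mrda_data.train_dataset))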
Example #7
class CallHomeEnglish(AbstractDataset):
    class Utterance:
        def __init__(self, id, utterance):
            self.name = "call_home_eng"
            self.id = utterance.utterance_id
            self.speaker = utterance.speaker
            self.tokens = utterance.tokens
            self.length = len(self.tokens)
            self.start_time = utterance.start_time
            self.end_time = utterance.end_time

    class Dialogue:
        def __init__(self, transcript):
            self.id = transcript.conversation_no
            self.utterances = []
            for id, utterance in enumerate(transcript.utterances):
                self.utterances.append(CallHomeEnglish.Utterance(
                    id, utterance))
            self.length = len(self.utterances)

    def __init__(self, args, dataset_path):
        self.name = type(self).__name__
        corpus = call.CorpusReader(dataset_path)
        self.total_length = 0
        self.vocabulary = Vocabulary()
        self.label_set_size = 0

        dataset = []
        for transcript in corpus.iter_transcripts(display_progress=True):
            if args.truncate_dataset and self.total_length > 25:
                break
            dataset.append(CallHomeEnglish.Dialogue(transcript))
            self.total_length += 1

        if args.truncate_dataset:
            self.train_dataset = dataset[:15]
            self.valid_dataset = dataset[15:20]
            self.test_dataset = dataset[20:]
        else:
            ## depending on the task (set in args), you can choose to return only the subset annotated for DA
            self.train_dataset = dataset[:140]
            self.valid_dataset = dataset[140:155]
            self.test_dataset = dataset[155:]

        for data_point in self.train_dataset:
            for utterance in data_point.utterances:
                self.vocabulary.add_and_get_indices(utterance.tokens)

        if args.limit_vocabulary:
            self.vocabulary.truncate()

        ## create character vocabulary
        self.vocabulary.get_character_vocab()
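Every __init__ on this page also assumes the same reader interface: a CorpusReader(dataset_path) exposing iter_transcripts(display_progress=...). The real swda, mrda, ami, and call readers are external modules not shown here; a toy stand-in with the same shape might look like this (the name and internals are illustrative only):

class ToyCorpusReader:
    def __init__(self, transcripts):
        self.transcripts = transcripts

    def iter_transcripts(self, display_progress=False):
        ## yield transcript objects one by one, optionally showing progress
        for i, transcript in enumerate(self.transcripts):
            if display_progress:
                print("transcript %d" % i, end="\r")
            yield transcript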
Example #8
def getVocabulary(pairs,
                  input_lang,
                  output_lang,
                  max_vocab_size,
                  reverse=False,
                  start_end_tokens=True):
    """
    generate vocabularies for the pairs
    :param list pairs: language sentence pairs
    :param str input_lang: input language name
    :param str output_lang: output language name
    :param int max_vocab_size: max vocabulary size
    :param bool reverse: whether to swap the input and output sentences
    :param bool start_end_tokens: whether to use start and end tokens
    :return: two vocabularies
    """
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]

    # initialise vocabularies
    input_vocab = Vocabulary(name=input_lang,
                             start_end_tokens=start_end_tokens,
                             max_vocab_size=max_vocab_size)
    output_vocab = Vocabulary(name=output_lang,
                              start_end_tokens=start_end_tokens,
                              max_vocab_size=max_vocab_size)
    input_sentences = []
    output_sentences = []

    # build vocabularies
    for pair in pairs:
        input_sentences.append(pair[0])
        output_sentences.append(pair[1])
    input_vocab.buildVocabulary(input_sentences)
    output_vocab.buildVocabulary(output_sentences)

    return input_vocab, output_vocab
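Note that the Vocabulary used here takes name, start_end_tokens, and max_vocab_size, so it is a different class from the dataset-side Vocabulary sketched earlier. A small, hypothetical usage sketch:

pairs = [["guten morgen", "good morning"], ["danke", "thank you"]]
src_vocab, tgt_vocab = getVocabulary(pairs, "de", "en",
                                     max_vocab_size=10000)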
Example #9
class AmericanMeetingCorpus(AbstractDataset):
	class Utterance:
		def __init__(self, id, utterance):
			self.name = "ami"
			self.id = utterance.utterance_id
			self.label = utterance.dialogue_act
			self.speaker = utterance.speaker
			self.tokens = utterance.tokens
			self.length = len(self.tokens)
			self.start_time = utterance.start_time
			self.end_time = utterance.end_time

	class Dialogue:
		def __init__(self, transcript):
			self.id = transcript.conversation_no
			self.utterances = []
			for id, utterance in enumerate(transcript.utterances):
				self.utterances.append(AmericanMeetingCorpus.Utterance(id, utterance))
			self.length = len(self.utterances)


	def __init__(self, args, dataset_path):
		self.name = type(self).__name__
		corpus = ami.CorpusReader(dataset_path)
		self.total_length = 0
		self.vocabulary = Vocabulary()
		self.label_set_size = len(AMI_DIALOGUE_TAGSET)

		dataset = []
		for transcript in corpus.iter_transcripts(display_progress=True):
			self.total_length += 1
			if args.truncate_dataset and self.total_length > 25:
				break
			dataset.append(AmericanMeetingCorpus.Dialogue(transcript))


		if args.truncate_dataset:
			self.train_dataset = dataset[:15]
			self.valid_dataset = dataset[15:20]
			self.test_dataset = dataset[20:]
		else:
			## depending on the task (set in args), you can choose to return only the subset annotated for DA
			self.train_dataset = []
			self.valid_dataset = []
			self.test_dataset = []
			for dialogue in dataset:
				if dialogue.id[:-1] in TRAIN_SPLIT:
					self.train_dataset.append(dialogue)
				elif dialogue.id[:-1] in DEV_SPLIT:
					self.valid_dataset.append(dialogue)
				elif dialogue.id[:-1] in TEST_SPLIT:
					self.test_dataset.append(dialogue)

		for data_point in self.train_dataset:
			for utterance in data_point.utterances:
				self.vocabulary.add_and_get_indices(utterance.tokens)

		if args.limit_vocabulary:
			self.vocabulary.truncate()

		## create character vocabulary
		self.vocabulary.get_character_vocab()
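The official split branch keys on dialogue.id[:-1]. Assuming AMI-style meeting ids such as "ES2002a", dropping the final session letter groups all sessions of one meeting into the same split, so TRAIN_SPLIT, DEV_SPLIT, and TEST_SPLIT would hold meeting prefixes. A tiny illustration under that assumption (the split contents are hypothetical):

TRAIN_SPLIT = {"ES2002", "ES2003"}  # hypothetical contents

for dialogue_id in ["ES2002a", "ES2002b", "ES2003c"]:
    assert dialogue_id[:-1] in TRAIN_SPLIT  # sessions share a meeting prefix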
Example #10
class SwitchBoard(AbstractDataset):
    class Utterance:
        ## minimum fields every dataset must provide: id, length, tokens
        def __init__(self, id, utterance):
            self.name = "swda"
            self.index = utterance.utterance_index
            self.id = id
            # index for DAMSL starts from 1
            self.label = DAMSL_TAGSET[utterance.damsl_act_tag().strip()]
            self.speaker = utterance.caller
            #TODO: clean text before processing
            self.tokens = utterance.text_words()
            self.length = len(self.tokens)
            self.pos = utterance.regularize_pos_lemmas()

    class Dialogue:
        ## minimum fields every dataset must provide: id, length, utterances
        def __init__(self, transcript):
            self.id = transcript.conversation_no
            ## the transcript length is not the same as the number of utterances
            self.length = transcript.length
            self.conversation_topic = transcript.topic_description
            self.utterances = []
            for id, utterance in enumerate(transcript.utterances):
                self.utterances.append(SwitchBoard.Utterance(id, utterance))

    def __init__(self, args, dataset_path):
        corpus = swda.CorpusReader(dataset_path)
        self.name = type(self).__name__
        self.total_length = 0
        self.vocabulary = Vocabulary()
        self.label_set_size = len(DAMSL_TAGSET)

        dataset = []
        for transcript in corpus.iter_transcripts(display_progress=True):
            self.total_length += 1
            if args.truncate_dataset and self.total_length > 25:
                break
            dataset.append(SwitchBoard.Dialogue(transcript))

        random.shuffle(dataset)  # shuffles in place; returns None

        ## 1155 transcribed conversations; 1115 / 19 / 21 split
        if args.truncate_dataset:
            self.train_dataset = dataset[:15]
            self.valid_dataset = dataset[15:20]
            self.test_dataset = dataset[20:]
        else:
            ## TODO: this split adheres to the numbers reported by Shriberg et al., but ideally cross-validation should be done
            self.train_dataset = dataset[:1115]
            self.valid_dataset = dataset[1115:1134]
            self.test_dataset = dataset[1134:]

        ## create vocabulary from training data (unseen tokens become UNKs at test time)
        for data_point in self.train_dataset:
            for utterance in data_point.utterances:
                self.vocabulary.add_and_get_indices(utterance.tokens)

        if args.limit_vocabulary:
            self.vocabulary.truncate()

        ## create character vocabulary
        self.vocabulary.get_character_vocab()
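One caveat with the shuffle above: random.shuffle permutes the list in place and returns None, so its result must not be assigned, and the resulting split differs on every run unless the RNG is seeded. A minimal sketch, assuming a reproducible split is wanted (the seed value is hypothetical):

import random

random.seed(42)  # the example above does not fix a seed
random.shuffle(dataset)  # in place; returns None
train, valid, test = dataset[:1115], dataset[1115:1134], dataset[1134:]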