Example 1
    def __init__(self, args, dataset_path):
        self.name = type(self).__name__
        corpus = call.CorpusReader(dataset_path)
        self.total_length = 0
        self.vocabulary = Vocabulary()
        self.label_set_size = 0

        dataset = []
        for transcript in corpus.iter_transcripts(display_progress=True):
            if args.truncate_dataset and self.total_length > 25:
                break
            dataset.append(CallHomeEnglish.Dialogue(transcript))
            self.total_length += 1

        if args.truncate_dataset:
            self.train_dataset = dataset[:15]
            self.valid_dataset = dataset[15:20]
            self.test_dataset = dataset[20:]
        else:
            ## depending on the task (set in args), one could return only the subset annotated for dialogue acts
            self.train_dataset = dataset[:140]
            self.valid_dataset = dataset[140:155]
            self.test_dataset = dataset[155:]

        for data_point in self.train_dataset:
            for utterance in data_point.utterances:
                self.vocabulary.add_and_get_indices(utterance.tokens)

        if args.limit_vocabulary:
            self.vocabulary.truncate()

        ## create character vocabulary
        self.vocabulary.get_character_vocab()
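
All of these loaders share a `Vocabulary` helper exposing `add_and_get_indices`, `truncate`, and `get_character_vocab`, whose definition is not shown here. Below is a minimal sketch of what such a class could look like; the method names come from the snippets, while the internals (frequency counting, the default cut-off, the special tokens) are assumptions.

from collections import Counter

class Vocabulary:
    """Minimal sketch of the vocabulary helper assumed by the loaders above."""

    def __init__(self, unk_token="<unk>", pad_token="<pad>"):
        self.word2id = {pad_token: 0, unk_token: 1}
        self.id2word = [pad_token, unk_token]
        self.counts = Counter()
        self.char2id = {}

    def add_and_get_indices(self, tokens):
        # Register each token and return its integer index (sketch).
        indices = []
        for token in tokens:
            self.counts[token] += 1
            if token not in self.word2id:
                self.word2id[token] = len(self.id2word)
                self.id2word.append(token)
            indices.append(self.word2id[token])
        return indices

    def truncate(self, max_size=10000):
        # Keep only the most frequent words; the cut-off value is a guess.
        kept = [w for w, _ in self.counts.most_common(max_size)
                if w not in (self.id2word[0], self.id2word[1])]
        self.id2word = self.id2word[:2] + kept
        self.word2id = {w: i for i, w in enumerate(self.id2word)}

    def get_character_vocab(self):
        # Derive a character-level vocabulary from the word vocabulary.
        chars = sorted({c for word in self.id2word for c in word})
        self.char2id = {c: i for i, c in enumerate(chars)}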
Example 2
    def __init__(self, args, dataset_path):
        self.name = type(self).__name__
        corpus = mrda.CorpusReader(dataset_path)
        # standard train/dev/test splits
        self.total_length = 0
        self.vocabulary = Vocabulary()
        self.label_set_size = len(DAMSL_TAGSET)

        dataset = []
        for transcript in corpus.iter_transcripts(display_progress=True):
            self.total_length += 1
            if args.truncate_dataset and self.total_length > 20:
                break
            dataset.append(MeetingRecoder.Dialogue(transcript))

        # TODO: determine the exact dev/test split for MRDA (or, better, use cross-validation)
        if args.truncate_dataset:
            self.train_dataset = dataset[:10]
            self.valid_dataset = dataset[10:15]
            self.test_dataset = dataset[15:20]
        else:
            self.train_dataset = dataset[:45]
            self.valid_dataset = dataset[45:60]
            self.test_dataset = dataset[60:]

        ## create vocabulary from training data (unseen tokens become UNKs at test time)
        for data_point in self.train_dataset:
            for utterance in data_point.utterances:
                self.vocabulary.add_and_get_indices(utterance.tokens)

        if args.limit_vocabulary:
            self.vocabulary.truncate()

        ## create character vocabulary
        self.vocabulary.get_character_vocab()
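
The corpus-specific `Dialogue` wrappers (`CallHomeEnglish.Dialogue`, `MeetingRecoder.Dialogue`, and so on) only need to expose an `utterances` list whose items carry a `tokens` attribute, since that is all the loaders read. A minimal sketch of such a container follows; every field other than `utterances`, `tokens`, and `id` is hypothetical.

class Utterance:
    """One speaker turn; `tokens` is the only field the loaders require."""
    def __init__(self, tokens, label=None, speaker=None):
        self.tokens = tokens      # list of word strings
        self.label = label        # dialogue-act tag, if annotated (hypothetical field)
        self.speaker = speaker    # speaker id (hypothetical field)


class Dialogue:
    """One transcript; wraps the utterance list the loaders iterate over."""
    def __init__(self, transcript_id, utterances):
        self.id = transcript_id   # used for split membership in the AMI loader
        self.utterances = utterances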
Example 3
    def __init__(self):
        self.name = ""
        self.total_length = 0
        self.vocabulary = Vocabulary()
        self.train_dataset = []
        self.test_dataset = []
        self.valid_dataset = []
        self.label_set_size = 0
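
The only attributes the loaders read from `args` are `truncate_dataset` and `limit_vocabulary`, so a plain `argparse.Namespace` is enough to drive them outside a full CLI. A hypothetical usage sketch; the class name and dataset path in the commented call are placeholders.

from argparse import Namespace

# Only the two flags the loaders read need to be present (sketch).
args = Namespace(truncate_dataset=True, limit_vocabulary=False)

# Hypothetical instantiation; the class name and path are placeholders.
# data = SwitchBoard(args, "path/to/swda")
# print(data.name, len(data.train_dataset), len(data.valid_dataset), len(data.test_dataset))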
Example 4
def getVocabulary(pairs,
                  input_lang,
                  output_lang,
                  max_vocab_size,
                  reverse=False,
                  start_end_tokens=True):
    """
    generate vocabularies for the pairs
    :param list pairs: language sentence pairs
    :param str input_lang: input language name
    :param str output_lang: output language name
    :param int max_vocab_size: max vocabulary size
    :param bool reverse: whether to swap the input and output sentences in each pair
    :param bool start_end_tokens: whether to use start and end tokens
    :return: two vocabularies
    """
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]

    # initialise vocabularies
    input_vocab = Vocabulary(name=input_lang,
                             start_end_tokens=start_end_tokens,
                             max_vocab_size=max_vocab_size)
    output_vocab = Vocabulary(name=output_lang,
                              start_end_tokens=start_end_tokens,
                              max_vocab_size=max_vocab_size)
    input_sentences = []
    output_sentences = []

    # build vocabularies
    for pair in pairs:
        input_sentences.append(pair[0])
        output_sentences.append(pair[1])
    input_vocab.buildVocabulary(input_sentences)
    output_vocab.buildVocabulary(output_sentences)

    return input_vocab, output_vocab
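
A small usage sketch for `getVocabulary`, assuming the seq2seq-style `Vocabulary` class it relies on is importable; the sentence pairs and language names below are made up.

# Made-up sentence pairs: [input sentence, output sentence].
pairs = [
    ["good morning", "buongiorno"],
    ["thank you very much", "grazie mille"],
    ["see you tomorrow", "a domani"],
]

input_vocab, output_vocab = getVocabulary(pairs,
                                          input_lang="en",
                                          output_lang="it",
                                          max_vocab_size=10000,
                                          reverse=False,
                                          start_end_tokens=True)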
Example 5
	def __init__(self, args, dataset_path):
		self.name = type(self).__name__
		corpus = ami.CorpusReader(dataset_path)
		self.total_length = 0
		self.vocabulary = Vocabulary()
		self.label_set_size = len(AMI_DIALOGUE_TAGSET)

		dataset = []
		for transcript in corpus.iter_transcripts(display_progress=True):
			self.total_length += 1
			if args.truncate_dataset and self.total_length > 25:
				break
			dataset.append(AmericanMeetingCorpus.Dialogue(transcript))


		if args.truncate_dataset:
			self.train_dataset = dataset[:15]
			self.valid_dataset = dataset[15:20]
			self.test_dataset = dataset[20:]
		else:
			## depending on the task (set in args), one could return only the subset annotated for dialogue acts
			self.train_dataset = []
			self.valid_dataset = []
			self.test_dataset = []
			for dialogue in dataset:
				if dialogue.id[:-1] in TRAIN_SPLIT:
					self.train_dataset.append(dialogue)
				elif dialogue.id[:-1] in DEV_SPLIT:
					self.valid_dataset.append(dialogue)
				elif dialogue.id[:-1] in TEST_SPLIT:
					self.test_dataset.append(dialogue)

		for data_point in self.train_dataset:
			for utterance in data_point.utterances:
				self.vocabulary.add_and_get_indices(utterance.tokens)

		if args.limit_vocabulary:
			self.vocabulary.truncate()

		## create character vocabulary
		self.vocabulary.get_character_vocab()
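
The AMI loader assigns a dialogue to a split via `dialogue.id[:-1]`, i.e. the id with its last character stripped, presumably so that all parts of the same meeting land in the same split. The following sketch only illustrates that convention; the ids and split sets are made up.

# Made-up ids and split sets, only to illustrate the id[:-1] convention.
TRAIN_SPLIT = {"ES2002", "ES2003"}
DEV_SPLIT = {"ES2011"}
TEST_SPLIT = {"ES2004"}

for dialogue_id in ["ES2002a", "ES2002b", "ES2011c", "ES2004d"]:
    meeting = dialogue_id[:-1]   # strip the trailing session-part character (assumption)
    if meeting in TRAIN_SPLIT:
        split = "train"
    elif meeting in DEV_SPLIT:
        split = "valid"
    elif meeting in TEST_SPLIT:
        split = "test"
    else:
        split = "dropped"
    print(dialogue_id, "->", split)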
Example 6
    def __init__(self, args, dataset_path):
        corpus = swda.CorpusReader(dataset_path)
        self.name = type(self).__name__
        self.total_length = 0
        self.vocabulary = Vocabulary()
        self.label_set_size = len(DAMSL_TAGSET)

        dataset = []
        for transcript in corpus.iter_transcripts(display_progress=True):
            self.total_length += 1
            if args.truncate_dataset and self.total_length > 25:
                break
            dataset.append(SwitchBoard.Dialogue(transcript))

        random.shuffle(dataset)  # shuffles in place; random.shuffle returns None

        ## 1155 transcribed dialogues; 1115 / 19 / 21 train/valid/test split
        if args.truncate_dataset:
            self.train_dataset = dataset[:15]
            self.valid_dataset = dataset[15:20]
            self.test_dataset = dataset[20:]
        else:
            ## TODO: this split follows the counts reported by Shriberg et al., but ideally cross-validation should be done
            self.train_dataset = dataset[:1115]
            self.valid_dataset = dataset[1115:1134]
            self.test_dataset = dataset[1134:]

        ## create vocabulary from training data (unseen tokens become UNKs at test time)
        for data_point in self.train_dataset:
            for utterance in data_point.utterances:
                self.vocabulary.add_and_get_indices(utterance.tokens)

        if args.limit_vocabulary:
            self.vocabulary.truncate()

        ## create character vocabulary
        self.vocabulary.get_character_vocab()
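
Since `random.shuffle` reorders the list in place and returns `None`, the SwitchBoard split above changes between runs unless the RNG is seeded. A sketch of a reproducible variant using a dedicated `random.Random` instance; the helper name and seed value are arbitrary.

import random

def split_dataset(dataset, n_train=1115, n_valid=19, seed=42):
    # Shuffle a copy with a fixed seed so the split is reproducible.
    rng = random.Random(seed)
    shuffled = list(dataset)
    rng.shuffle(shuffled)
    train = shuffled[:n_train]
    valid = shuffled[n_train:n_train + n_valid]
    test = shuffled[n_train + n_valid:]
    return train, valid, test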