# assumes swda.py from https://github.com/cgpotts/swda is on the path
from swda import CorpusReader


def load_swda_corpus_data(swda_directory):
    """Read every SwDA transcript and return the utterances, their dialogue-act
    tags (converted to integer indices), and per-tag occurrence counts."""
    print('Loading SwDA Corpus...')
    corpus_reader = CorpusReader(swda_directory)

    talks = []
    talk_names = []
    tags_seen = set()
    tag_occurrences = {}
    for transcript in corpus_reader.iter_transcripts(display_progress=False):
        name = 'sw' + str(transcript.conversation_no)
        talk_names.append(name)
        conversation_content = []
        conversation_tags = []
        for utterance in transcript.utterances:
            conversation_content.append(utterance.text_words(filter_disfluency=True))
            tag = utterance.damsl_act_tag()
            conversation_tags.append(tag)
            if tag not in tags_seen:
                tags_seen.add(tag)
                tag_occurrences[tag] = 1
            else:
                tag_occurrences[tag] += 1
        talks.append((conversation_content, conversation_tags))

    print('\nFound ' + str(len(tags_seen)) + ' different utterance tags.\n')

    # map each tag to a stable integer index and rewrite the tags in place
    tag_indices = {tag: i for i, tag in enumerate(sorted(tags_seen))}

    for talk in talks:
        talk_tags = talk[1]
        for i, tag in enumerate(talk_tags):
            talk_tags[i] = tag_indices[tag]

    print('Loaded SwDA Corpus.')
    return talks, talk_names, tag_indices, tag_occurrences
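A minimal call sketch for the loader above (the 'swda/swda' path is illustrative; point it at wherever the corpus CSVs were extracted):

if __name__ == '__main__':
    talks, talk_names, tag_indices, tag_occurrences = load_swda_corpus_data('swda/swda')
    print(len(talks), 'conversations loaded, first one:', talk_names[0])
    # five most frequent dialogue-act tags
    for tag, count in sorted(tag_occurrences.items(), key=lambda kv: -kv[1])[:5]:
        print(tag, count)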
Example #2
    def __init__(self, data_dir, tokenizer, word2id, task='', seed=42):
        self.corpus = CorpusReader(data_dir)
        self.data_dir = data_dir
        self.tokenizer = tokenizer
        self.word2id = word2id
        self.task = task

        # self.utt_num = 0
        # for utt in self.corpus.iter_utterances():
            # self.utt_num += 1

        self.trans_num = 1155
        self.deleted_utterances = 0
        self.deleted_tokens = 0
        self.in_tokens = 0
        # for trans in self.corpus.iter_transcripts():
            # self.trans_num += 1

        self.da2num = switchboard_da_mapping()

        self.stopwords = get_stopwords(data_dir, word2id)

        # CAUTION: make sure that for each task the seed is the same s.t. the splits will be the same!
        train_ixs, val_ixs = train_test_split(range(self.trans_num), shuffle=True, train_size=0.8, random_state=seed)
        val_ixs, test_ixs = train_test_split(val_ixs, shuffle=True, train_size=0.5, random_state=seed)
        self.train_ixs, self.val_ixs, self.test_ixs = train_ixs, val_ixs, test_ixs
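        # with trans_num = 1155, this yields roughly 924 train / 115 validation /
        # 116 test transcript indices (exact counts depend on sklearn's rounding)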

        self.utt_da_pairs = []
        prev_da = "%"
        for i, utt in enumerate(self.corpus.iter_utterances()):
            sentence = self.clean_utt(utt.text)
            if not sentence: continue
            
            self.in_tokens += len(sentence)
            sentence = self.word2id(sentence)
            if len(sentence) == 0:
                continue

            if not self.utt_acceptable(sentence):
                continue

            act = utt.damsl_act_tag()
            if act is None: act = "%"
            if act == "+": act = prev_da
            prev_da = act  # remember the last resolved DA so '+' (continuation) can inherit it

            _, swda_name = os.path.split(utt.swda_filename)
            swda_name = swda_name[:-4] if swda_name.endswith('.csv') else swda_name

            ix = utt.utterance_index

            self.utt_da_pairs.append((sentence, act, swda_name, ix))
Example #3
    def __init__(self, data_dir, tokenizer, word2id, task='', seed=42):
        self.corpus = CorpusReader(data_dir)
        self.data_dir = data_dir
        self.tokenizer = tokenizer
        self.word2id = word2id
        self.task = task

        self.utt_num = 0
        for utt in self.corpus.iter_utterances():
            self.utt_num += 1

        self.trans_num = 0
        for trans in self.corpus.iter_transcripts():
            self.trans_num += 1

        self.da2num = switchboard_da_mapping()

        # CAUTION: make sure that for each task the seed is the same s.t. the splits will be the same!
        train_ixs, val_ixs = train_test_split(range(self.trans_num),
                                              shuffle=True,
                                              train_size=0.8,
                                              random_state=seed)
        val_ixs, test_ixs = train_test_split(val_ixs,
                                             shuffle=True,
                                             train_size=0.5,
                                             random_state=seed)
        self.train_ixs, self.val_ixs, self.test_ixs = train_ixs, val_ixs, test_ixs

        self.utt_da_pairs = []
        prev_da = "%"
        for i, utt in enumerate(self.corpus.iter_utterances()):
            sentence = re.sub(r"([+/\}\[\]]|\{\w)", "", utt.text)

            sentence = self.word2id(self.tokenizer(sentence))
            act = utt.damsl_act_tag()
            if act is None: act = "%"
            if act == "+": act = prev_da
            prev_da = act  # track the previous DA so '+' (continuation) tags inherit it

            _, swda_name = os.path.split(utt.swda_filename)
            swda_name = swda_name[:-4] if swda_name.endswith('.csv') else swda_name

            ix = utt.utterance_index

            self.utt_da_pairs.append((sentence, act, swda_name, ix))
Example #4
def load_swda_corpus_data(swda_directory):
    """Read every SwDA transcript and return per-conversation utterances (X) and
    tags (Y), the combined (content, tags) pairs, conversation names, a
    tag-to-index map, and per-tag occurrence counts."""
    print('Loading SwDA Corpus...')
    corpus_reader = CorpusReader(swda_directory)

    talks = []
    talk_names = []
    tags_seen = {}       # maps each tag to the order in which it was first seen
    tag_occurrences = {}
    num_tags_seen = 0
    X = []
    Y = []
    for transcript in corpus_reader.iter_transcripts(display_progress=False):
        name = 'sw' + str(transcript.conversation_no)
        talk_names.append(name)
        conversation_content = []
        conversation_tags = []
        for utterance in transcript.utterances:
            conversation_content.append(utterance.text_words(filter_disfluency=True))
            tag = utterance.damsl_act_tag()
            conversation_tags.append(tag)
            if tag not in tags_seen:
                tags_seen[tag] = num_tags_seen
                num_tags_seen += 1
                tag_occurrences[tag] = 1
            else:
                tag_occurrences[tag] += 1

        talks.append((conversation_content, conversation_tags))
        X.append(conversation_content)
        Y.append(conversation_tags)

    print('\nFound ' + str(len(tags_seen)) + ' different utterance tags.\n')

    # Y shares the same list objects as talks, so this in-place rewrite also
    # converts the tags in Y from strings to integer indices
    for talk in talks:
        conversation_tags = talk[1]
        for i in range(len(conversation_tags)):
            conversation_tags[i] = tags_seen[conversation_tags[i]]

    print('Loaded SwDA Corpus.')
    return X, Y, talks, talk_names, tags_seen, tag_occurrences
Example #5
class SwitchboardConverter:
    def __init__(self, data_dir, tokenizer, word2id, task='', seed=42):
        self.corpus = CorpusReader(data_dir)
        self.data_dir = data_dir
        self.tokenizer = tokenizer
        self.word2id = word2id
        self.task = task

        # self.utt_num = 0
        # for utt in self.corpus.iter_utterances():
            # self.utt_num += 1

        self.trans_num = 1155
        self.deleted_utterances = 0
        self.deleted_tokens = 0
        self.in_tokens = 0
        # for trans in self.corpus.iter_transcripts():
            # self.trans_num += 1

        self.da2num = switchboard_da_mapping()

        self.stopwords = get_stopwords(data_dir, word2id)

        # CAUTION: make sure that for each task the seed is the same s.t. the splits will be the same!
        train_ixs, val_ixs = train_test_split(range(self.trans_num), shuffle=True, train_size=0.8, random_state=seed)
        val_ixs, test_ixs = train_test_split(val_ixs, shuffle=True, train_size=0.5, random_state=seed)
        self.train_ixs, self.val_ixs, self.test_ixs = train_ixs, val_ixs, test_ixs

        self.utt_da_pairs = []
        prev_da = "%"
        for i, utt in enumerate(self.corpus.iter_utterances()):
            sentence = self.clean_utt(utt.text)
            if not sentence: continue
            
            self.in_tokens += len(sentence)
            sentence = self.word2id(sentence)
            if len(sentence) == 0:
                continue

            if not self.utt_acceptable(sentence):
                continue

            act = utt.damsl_act_tag()
            if act is None: act = "%"
            if act == "+": act = prev_da
            prev_da = act  # remember the last resolved DA so '+' (continuation) can inherit it

            _, swda_name = os.path.split(utt.swda_filename)
            swda_name = swda_name[:-4] if swda_name.endswith('.csv') else swda_name

            ix = utt.utterance_index

            self.utt_da_pairs.append((sentence, act, swda_name, ix))

    def clean_utt(self, utterance):
        # strip SwDA markup (disfluency brackets, slash units, asides, filled pauses, etc.);
        # re.subn returns the number of substitutions made, tallied as deleted tokens
        patterns = [r"([+/\}\[\],\-\(\)#]|\{\w)", r"<+.*>+", r"\*\w+",
                    r">[\s\w'?]+$", r"\*.+$", r"\^\w+$", r"^uh+$", r"(uh+)"]
        for pattern in patterns:
            utterance, cnt = re.subn(pattern, "", utterance)
            self.deleted_tokens += cnt

        # drop utterances that are empty (a lone '.') or still contain markup
        if re.search(r"(^\s*\.\s*$)|>", utterance):
            self.deleted_utterances += 1
            return None
        utterance = [w.lower() for w in utterance.split(" ")
                     if len(w) > 0 and not re.search(r"[Uu][Hh]+", w)]

        return utterance

    def utt_acceptable(self, utt):
        # check whether an utterance is acceptable for perturbation
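        # illustration (hypothetical stopword list {'i', 'the', 'was'}):
        #   ['i', 'think', 'the', 'weather', 'was', 'nice'] -> 3 non-stopwords -> rejected
        #   ['we', 'really', 'enjoyed', 'hiking', 'there']  -> 5 non-stopwords -> accepted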
        if not utt: return False

        stop_cnt = 0
        for w in utt:
            if w in self.stopwords:
                stop_cnt += 1

        # acceptable iff the utterance has at least 4 non-stopword tokens
        return (len(utt) - stop_cnt) >= 4

    def draw_rand_sent(self):
        r = random.randint(0, len(self.utt_da_pairs)-1)
        return self.utt_da_pairs[r]

    def create_vocab(self):
        print("Creating Vocab file for Switchboard")

        cnt = Counter()
        for utt in self.corpus.iter_utterances():
            sentence = re.sub(r"([+/\}\[\]]|\{\w)", "",
                            utt.text)
            sentence = self.tokenizer(sentence)
            for w in sentence:
                cnt[w] += 1

        itos_file = os.path.join(self.data_dir, "itos.txt")
        # use a context manager so the vocab file is flushed and closed
        with open(itos_file, "w") as itosf:
            # keep the 25k most frequent tokens
            for (word, _) in cnt.most_common(25000):
                itosf.write("{}\n".format(word))


    # getKeysByValue(d, v), defined elsewhere in the project, returns all keys of d whose value is v
    def swda_permute(self, sents, amount, speaker_ixs):
        if amount == 0:
            return []

        permutations = [list(range(len(sents)))]
        segment_permutations = []
        amount = min(amount, factorial(len(sents))-1)
        segm_ixs = self.speaker_segment_ixs(speaker_ixs)
        segments = list(set(segm_ixs.values()))

        for i in range(amount):
            while True:
                permutation = []
                segm_perm = np.random.permutation(len(segments))
                for segm_ix in segm_perm:
                    utt_ixs = sorted(getKeysByValue(segm_ixs, segm_ix))
                    permutation = permutation + utt_ixs

                if permutation not in permutations:
                    break

            # record the segment order only once the permutation is accepted,
            # so permutations and segment_permutations stay aligned
            segment_permutations.append(segm_perm)
            permutations.append(permutation)
        # drop the first entry: it is the original order, kept only so it cannot be re-generated
        return permutations[1:], segment_permutations

    def speaker_segment_ixs(self, speaker_ixs):
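        # maps each utterance index to a consecutive same-speaker "segment" index, e.g.
        # speaker_ixs = [0, 0, 1, 1, 1, 0]  ->  {0: 0, 1: 0, 2: 1, 3: 1, 4: 1, 5: 2}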
        i = 0
        segment_indices = dict()
        prev_speaker = speaker_ixs[0]
        for j,speaker in enumerate(speaker_ixs):
            if speaker != prev_speaker:
                prev_speaker = speaker
                i += 1
            segment_indices[j] = i
        return segment_indices

    def swda_half_perturb(self, amount, speaker_ixs):
        segm_ixs = self.speaker_segment_ixs(speaker_ixs)
        segments = list(set(segm_ixs.values()))
        segment_permutations = []
        permutations = [list(segm_ixs.keys())]
        for _ in range(amount):
            speaker = random.randint(0,1) # choose one of the speakers
            speaker_to_perm = list(filter(lambda x: (x-speaker) % 2 == 0, segments))
            speaker_orig = list(filter(lambda x: (x-speaker) % 2 != 0, segments))
            #TODO: rename either speaker_ix or speaker_ixs, they are something different, but the names are too close
            if len(speaker_to_perm) < 2:
                return []

            while True:
                permuted_speaker_ix = np.random.permutation(speaker_to_perm).tolist()

                new_segments = [None] * (len(speaker_orig) + len(permuted_speaker_ix))
                if speaker == 0:
                    new_segments[::2] = permuted_speaker_ix
                    new_segments[1::2] = speaker_orig
                else:
                    new_segments[1::2] = permuted_speaker_ix
                    new_segments[::2] = speaker_orig

                permutation = []
                for segm_ix in new_segments:
                    utt_ixs = sorted(getKeysByValue(segm_ixs, segm_ix))
                    permutation = permutation + utt_ixs

                if permutation not in permutations:
                    # record both only for accepted permutations, keeping the two lists aligned
                    segment_permutations.append(new_segments)
                    permutations.append(permutation)
                    break

        return permutations, segment_permutations

    def swda_utterance_insertion(self, speaker_ixs, amounts):
        segment_ixs = self.speaker_segment_ixs(speaker_ixs)
        segments = list(set(segment_ixs.values()))
        segment_permutations = []
        permutations = []

        for _ in range(amounts):
            while True: # actually: do ... while permutation not in permutations
                i_from = random.randint(0, len(segments)-1)
                i_to = random.randint(0, len(segments)-2)
                segm_perm = deepcopy(segments)
                rem_elem = segments[i_from]
                segm_perm = segm_perm[0:i_from] + segm_perm[i_from+1:]
                segm_perm = segm_perm[0:i_to] + [rem_elem] + segm_perm[i_to:]

                permutation = []
                for segm_ix in segm_perm:
                    utt_ixs = sorted(getKeysByValue(segment_ixs, segm_ix))
                    permutation = permutation + utt_ixs

                if permutation not in permutations:
                    permutations.append(permutation)
                    segment_permutations.append(segm_perm)
                    break

        return permutations, segment_permutations

    def swda_utterance_sampling(self, utterances, acts, speaker_ixs, amount):
        segm_ixs = self.speaker_segment_ixs(speaker_ixs)
        segments = list(set(segm_ixs.values()))

        permutations = []

        for i in range(amount):
            (sentence, act, swda_name, ix) = self.draw_rand_sent()
            # insert_ix = random.choice(range(len(utterances)))

            while True:
                insert_ix = np.random.choice(range(len(utterances)))
                utt = utterances[insert_ix]
                act_orig = acts[insert_ix]
                if self.utt_acceptable(utt) and act != act_orig:
                    break

            permutations.append((sentence, act, swda_name, ix, insert_ix))

        return permutations

    def convert_dset(self, amounts):
        # create distinct train/validation/test files. they'll correspond to the created
        # splits from the constructor
        train_output_file = os.path.join(self.data_dir, 'train', 'coherency_dset_{}.txt'.format(self.task))
        val_output_file = os.path.join(self.data_dir, 'validation', 'coherency_dset_{}.txt'.format(self.task))
        test_output_file = os.path.join(self.data_dir, 'test', 'coherency_dset_{}.txt'.format(self.task))
        for split_dir in ('train', 'validation', 'test'):
            os.makedirs(os.path.join(self.data_dir, split_dir), exist_ok=True)

        trainfile = open(train_output_file, 'w')
        valfile = open(val_output_file, 'w')
        testfile = open(test_output_file, 'w')

        shuffled_path = os.path.join(self.data_dir, "shuffled_{}".format(self.task))
        if not os.path.isdir(shuffled_path):
            os.mkdir(shuffled_path)

        for i,trans in enumerate(tqdm(self.corpus.iter_transcripts(display_progress=False), total=1155)):
            utterances = []
            acts = []
            speaker_ixs = []
            prev_act = "%"
            for utt in trans.utterances:
                sentence = self.clean_utt(utt.text)
                sentence = self.word2id(sentence)
                # print(sentence, " ## DAs: ", utt.act_tag)
                utterances.append(sentence)
                act = utt.damsl_act_tag()
                if act is None: act = "%"
                if act == "+": act = prev_act
                acts.append(self.da2num[act])
                prev_act = act
                speaker_ixs.append(0 if "A" in utt.caller else 1)

            if self.task == 'up':
                permuted_ixs, segment_perms = self.swda_permute(utterances, amounts, speaker_ixs)
            elif self.task == 'us':
                permuted_ixs = self.swda_utterance_sampling(utterances, acts, speaker_ixs, amounts)
            elif self.task == 'hup':
                permuted_ixs, segment_perms = self.swda_half_perturb(amounts, speaker_ixs)
            elif self.task == 'ui':
                permuted_ixs, segment_perms = self.swda_utterance_insertion(speaker_ixs, amounts)
            else:
                raise ValueError("unknown task: {}".format(self.task))

            swda_fname = os.path.split(trans.swda_filename)[1]
            shuffle_file = os.path.join(shuffled_path, swda_fname) # [:-4]
            with open(shuffle_file, "w") as f:
                #TODO: analogous to DD, write switchboard name into the file
                csv_writer = csv.writer(f)
                if self.task == 'us':
                    for perm in permuted_ixs:
                        (utt, da, name, ix, insert_ix) = perm
                        row = [name, ix,insert_ix]
                        csv_writer.writerow(row)
                else:
                    for perm in segment_perms:
                        csv_writer.writerow(perm)

            if self.task == 'us':
                for p in permuted_ixs:
                    a = " ".join([str(x) for x in acts])
                    u = str(utterances)
                    # (sentence, act, swda_name, ix, insert_ix)
                    insert_sent, insert_da, name, ix, insert_ix = p
                    insert_da = self.da2num[insert_da]
                    p_a = deepcopy(acts)
                    p_a[insert_ix] = insert_da
                    pa = " ".join([str(x) for x in p_a])
                    p_u = deepcopy(utterances)
                    p_u[insert_ix] = insert_sent

                    if i in self.train_ixs:
                        trainfile.write("{}|{}|{}|{}|{}\n".format("0",a,u,pa,p_u))
                        trainfile.write("{}|{}|{}|{}|{}\n".format("1",pa,p_u,a,u))
                    if i in self.val_ixs:
                        valfile.write("{}|{}|{}|{}|{}\n".format("0",a,u,pa,p_u))
                        valfile.write("{}|{}|{}|{}|{}\n".format("1",pa,p_u,a,u))
                    if i in self.test_ixs:
                        testfile.write("{}|{}|{}|{}|{}\n".format("0",a,u,pa,p_u))
                        testfile.write("{}|{}|{}|{}|{}\n".format("1",pa,p_u,a,u))

            else:
                for p in permuted_ixs:
                    a = " ".join([str(x) for x in acts])
                    u = str(utterances)
                    pa = [acts[i] for i in p]
                    p_a = " ".join([str(x) for x in pa])
                    pu = [utterances[i] for i in p]
                    p_u = str(pu)

                    if i in self.train_ixs:
                        trainfile.write("{}|{}|{}|{}|{}\n".format("0",a,u,p_a,p_u))
                        trainfile.write("{}|{}|{}|{}|{}\n".format("1",p_a,p_u,a,u))
                    if i in self.val_ixs:
                        valfile.write("{}|{}|{}|{}|{}\n".format("0",a,u,p_a,p_u))
                        valfile.write("{}|{}|{}|{}|{}\n".format("1",p_a,p_u,a,u))
                    if i in self.test_ixs:
                        testfile.write("{}|{}|{}|{}|{}\n".format("0",a,u,p_a,p_u))
                        testfile.write("{}|{}|{}|{}|{}\n".format("1",p_a,p_u,a,u))

        # close the split files so everything is flushed to disk
        trainfile.close()
        valfile.close()
        testfile.close()
Example #6
# similarity_processor = BertSimilarity()

QUESTIONS = ["qy", "qw", "qo", "qr"]


def get_similarity(sentence1, sentence2):
    # placeholder: print the pair and return a constant score; swap in
    # similarity_processor.get_similarity(sentence1, sentence2) for real scores
    print(sentence1, sentence2)
    return 1


def is_question(utterance):
    return utterance.damsl_act_tag() in QUESTIONS


if __name__ == '__main__':
    data_dir = "swda/swda"
    scan_range = 5
    cr = CorpusReader(data_dir)
    for dialog in cr.iter_transcripts(display_progress=True):
        for index, utterance in enumerate(dialog.utterances):
            if index < scan_range or index >= len(
                    dialog.utterances) - scan_range:
                continue
            if is_question(utterance):
                for i in range(-scan_range, scan_range):
                    print(
                        get_similarity(utterance.text,
                                       dialog.utterances[index + i].text))
    # pyplot.bar(range(scan_range*2), aggregate)
Example #7
    'ng', 'ny', 'qw^d', 'bd', 'qy^d', 'bf', 'ft', 'ba', 'bh', 'bk', 'fa', 'fc',
    'br', 'qh', 'oo', 'b', 'qw', 'qy', 'h', 't3', 'o', 't1', '^h', 'aap', '^q',
    'x', 'sd', '^2', 'qo', '^g'
]
convert = {
    '+': 'sd',
    'fo_o_fw_"_by_bc': 'sd',
    'oo_co_cc': 'sd',
    'arp_nd': 'no',
    'aap_am': 'sd'
}

# assuming the SwDA corpus is installed at path-to-project/swda
# repo URL: https://github.com/cgpotts/swda
# the preprocessor script for this model is in https://github.com/miyamotost/swda
corpus = CorpusReader('swda/swda')

with open('dataset/swda_datset_training.txt',
          mode='a') as f1, open('dataset/swda_datset_test.txt',
                                mode='a') as f2:
    for i, trans in enumerate(corpus.iter_transcripts(display_progress=False)):
        speakerids = [pad, pad, pad, pad]
        utts = [pad, pad, pad, pad]
        labels = [pad, pad, pad, pad]
        print('iter: {}'.format(i + 1))

        #
        # speakerid   : utt.caller_no
        # main_topics : trans.topic_description  (use "PAD" for now, i.e. ignored)
        # pos         : utt.act_tag  (use "PAD" for now, i.e. ignored)
        # utt         : utt.text
Example #8
            pos = pos_list[k][1]
            add_token(language, sentence, word, lemma, pos, i, j, k + 1)

    tree = etree.ElementTree(text)
    tree.write(out_file,
               pretty_print=True,
               xml_declaration=True,
               encoding='utf-8')


def add_token(language, sentence, token, lemma, pos, i, j, k):
    """
    Converts a CONLL-U token to a OPUS-xml 'w'-tag.
    """
    word = etree.SubElement(sentence, 'w')
    word.text = token
    word.set('id', 'w{}.{}.{}'.format(i, j, k))
    word.set('lem', lemma)
    word.set('tree', pos)


if __name__ == '__main__':
    corpus = CorpusReader('swda/swda')
    for transcript in corpus.iter_transcripts():
        out_file = os.path.splitext(os.path.basename(
            transcript.swda_filename))[0]
        out_path = 'swda-opus'
        os.makedirs(out_path, exist_ok=True)
        process_single('en', transcript,
                       os.path.join(out_path, out_file + '.xml'))
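For illustration, a self-contained sketch of the 'w' element that add_token builds (assuming lxml, which the pretty_print keyword above implies; the sentence element and attribute values are made up):

from lxml import etree

sentence = etree.Element('s', id='s1.1')
word = etree.SubElement(sentence, 'w')
word.text = 'hello'
word.set('id', 'w1.1.1')
word.set('lem', 'hello')
word.set('tree', 'UH')
print(etree.tostring(sentence, pretty_print=True).decode())
# <s id="s1.1">
#   <w id="w1.1.1" lem="hello" tree="UH">hello</w>
# </s>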
Example #9
import logging
from swda.swda import CorpusReader
corpus = CorpusReader('swda/swda')

train_set_idx = [
    'sw2005', 'sw2006', 'sw2008', 'sw2010', 'sw2012', 'sw2015', 'sw2018',
    'sw2019', 'sw2020', 'sw2022', 'sw2024', 'sw2025', 'sw2027', 'sw2028',
    'sw2032', 'sw2035', 'sw2038', 'sw2039', 'sw2040', 'sw2041', 'sw2051',
    'sw2060', 'sw2061', 'sw2062', 'sw2064', 'sw2065', 'sw2073', 'sw2078',
    'sw2079', 'sw2085', 'sw2086', 'sw2090', 'sw2092', 'sw2093', 'sw2094',
    'sw2095', 'sw2101', 'sw2102', 'sw2104', 'sw2105', 'sw2107', 'sw2109',
    'sw2110', 'sw2111', 'sw2113', 'sw2120', 'sw2122', 'sw2124', 'sw2125',
    'sw2130', 'sw2137', 'sw2139', 'sw2145', 'sw2149', 'sw2154', 'sw2155',
    'sw2157', 'sw2168', 'sw2171', 'sw2177', 'sw2178', 'sw2180', 'sw2181',
    'sw2184', 'sw2185', 'sw2187', 'sw2190', 'sw2191', 'sw2197', 'sw2205',
    'sw2220', 'sw2221', 'sw2226', 'sw2227', 'sw2228', 'sw2231', 'sw2232',
    'sw2234', 'sw2235', 'sw2237', 'sw2241', 'sw2244', 'sw2247', 'sw2248',
    'sw2249', 'sw2252', 'sw2259', 'sw2260', 'sw2262', 'sw2263', 'sw2264',
    'sw2265', 'sw2266', 'sw2268', 'sw2275', 'sw2278', 'sw2279', 'sw2283',
    'sw2285', 'sw2287', 'sw2290', 'sw2292', 'sw2293', 'sw2295', 'sw2296',
    'sw2300', 'sw2301', 'sw2302', 'sw2303', 'sw2304', 'sw2305', 'sw2308',
    'sw2309', 'sw2313', 'sw2314', 'sw2316', 'sw2323', 'sw2324', 'sw2325',
    'sw2330', 'sw2331', 'sw2334', 'sw2336', 'sw2339', 'sw2342', 'sw2344',
    'sw2349', 'sw2353', 'sw2354', 'sw2355', 'sw2362', 'sw2365', 'sw2366',
    'sw2368', 'sw2370', 'sw2372', 'sw2376', 'sw2379', 'sw2380', 'sw2382',
    'sw2383', 'sw2386', 'sw2387', 'sw2389', 'sw2393', 'sw2397', 'sw2405',
    'sw2406', 'sw2407', 'sw2413', 'sw2418', 'sw2421', 'sw2423', 'sw2424',
    'sw2426', 'sw2427', 'sw2429', 'sw2431', 'sw2432', 'sw2433', 'sw2435',
    'sw2436', 'sw2437', 'sw2439', 'sw2442', 'sw2445', 'sw2446', 'sw2448',
    'sw2450', 'sw2451', 'sw2452', 'sw2457', 'sw2460', 'sw2465', 'sw2466',
    'sw2467', 'sw2469', 'sw2471', 'sw2472', 'sw2476', 'sw2477', 'sw2478',
Example #10
def prep_swda():
    """
    Put the conversations into a json format that torchtext can read easily.
    Each "example" is a conversation comprised of a list of utterances 
    and a list of dialogue act tags (each the same length)
    """

    log.info("Loading SWDA corpus.")
    if not os.path.isdir(SWDA_CORPUS_DIR):  # extract the corpus on first run
        with zipfile.ZipFile("swda/swda.zip") as zip_ref:
            zip_ref.extractall('data')
    corpus = CorpusReader(SWDA_CORPUS_DIR)
    corpus = {t.conversation_no: t for t in corpus.iter_transcripts()}

    bert_vocab_file = BERT_VOCAB_FILE.format(BERT_MODEL)
    if not os.path.isfile(bert_vocab_file):
        log.info("Customizing BERT vocab.")
        customize_bert_vocab()
    log.info("Loading BERT vocab/tokenizer.")
    bert_tokenizer = BertTokenizer.from_pretrained(bert_vocab_file, 
            never_split = BERT_RESERVED_TOKENS + BERT_CUSTOM_TOKENS)

    log.info("Getting splits.")
    splits_file = SWDA_SPLITS.format('splits')
    if os.path.isfile(splits_file): # use existing SWDA splits (for reproducibility purposes)
        with open(splits_file) as f:
            splits = json.load(f)
    else: # save the splits file
        splits = gen_splits(list(corpus.keys()))
        with open(splits_file, 'w') as f:
            json.dump(splits, f)

    def words_to_ints(ws):
        maxvalue = max(vocab.values())
        for w in ws:
            if w not in vocab:
                maxvalue += 1
                vocab[w] = maxvalue
        xs = [vocab[x] for x in ws]
        return xs

    def tag_to_int(tag):
        maxvalue = max(tag_vocab.values()) if tag_vocab else -1
        if tag not in tag_vocab:
            maxvalue += 1
            tag_vocab[tag] = maxvalue
        return tag_vocab[tag] 

    def extract_example(transcript):
        """ Gets the parts we need from the SWDA utterance object """ 
        tags, tags_ints, utts, utts_ints, utts_ints_bert, utts_ints_nl, utts_ints_bert_nl = [], [], [], [], [], [], []
        for utt in transcript.utterances:
            # Regex tokenization
            words = "[SPKR_{}] ".format(utt.caller) + tokenize(utt.text.lower())
            words_nl = remove_laughters(remove_disfluencies(words))
            utts.append(words)
            utts_ints.append(words_to_ints(words.split()))
            utts_ints_nl.append(words_to_ints(words_nl.split()))
            # BERT wordpiece tokenization
            bert_text = "[CLS] [SPKR_{}] ".format(utt.caller) + utt.text
            bert_tokens = bert_tokenizer.tokenize(bert_text) # list of strings
            utts_ints_bert.append(bert_tokenizer.convert_tokens_to_ids(bert_tokens))
            bert_text_nl = remove_laughters(remove_disfluencies(bert_text))
            bert_tokens_nl = bert_tokenizer.tokenize(bert_text_nl)
            utts_ints_bert_nl.append(bert_tokenizer.convert_tokens_to_ids(bert_tokens_nl))
            # dialogue act tags
            tag = damsl_tag_cluster(utt.act_tag)
            tags.append(tag)
            tags_ints.append(tag_to_int(tag))
        return {'id': transcript.conversation_no, 'utts': utts, 'utts_ints': utts_ints, 
                'utts_ints_bert': utts_ints_bert, 'tags': tags, 'tags_ints': tags_ints,
                'utts_ints_bert_nl': utts_ints_bert_nl, 'utts_ints_nl': utts_ints_nl}

    log.info("Extracting data and saving splits.")
    for split in splits:
        data = []
        for ex_id in tqdm(splits[split], desc=split):
            data.append(extract_example(corpus[ex_id]))
        with open(SWDA_SPLITS.format(split), 'w') as f:
            json.dump(data, f)
    log.info("Vocab size: {}". format(len(vocab)))
    with open(SWDA_SPLITS.format("vocab"), 'w') as f:
        json.dump(vocab, f)
    log.info("Tag vocab size: {}". format(len(tag_vocab)))
    with open(SWDA_SPLITS.format("tag_vocab"), 'w') as f:
        json.dump(tag_vocab, f)