def load_swda_corpus_data(swda_directory):
    print('Loading SwDA Corpus...')
    corpus_reader = CorpusReader(swda_directory)

    talks = []
    talk_names = []
    tags_seen = set()
    tag_occurances = {}
    for transcript in corpus_reader.iter_transcripts(False):
        name = 'sw' + str(transcript.conversation_no)
        talk_names.append(name)
        conversation_content = []
        conversation_tags = []
        for utterance in transcript.utterances:
            conversation_content.append( utterance.text_words(True) )
            tag = utterance.damsl_act_tag()
            conversation_tags.append( tag )
            if tag not in tags_seen:
                tags_seen.add(tag)
                tag_occurances[tag] = 1
            else:
                tag_occurances[tag] += 1
        talks.append( (conversation_content, conversation_tags) )

    print('\nFound ' + str(len(tags_seen)) + ' different utterance tags.\n')

    tag_indices = {tag:i for i, tag in enumerate(sorted(list(tags_seen)))}

    for talk in talks:
        talk_tags = talk[1]
        for i, tag in enumerate(talk_tags):
            talk_tags[i] = tag_indices[ tag ]

    print('Loaded SwDA Corpus.')
    return talks, talk_names, tag_indices, tag_occurances
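A minimal driver for the loader above (a sketch: it assumes the swda package from https://github.com/cgpotts/swda is importable and that the corpus CSVs live under a hypothetical swda/swda directory):

# Usage sketch -- not part of the original example; the corpus path is an assumption.
from swda import CorpusReader

if __name__ == '__main__':
    talks, talk_names, tag_indices, tag_occurances = load_swda_corpus_data('swda/swda')
    print(len(talks), 'conversations loaded,', len(tag_indices), 'distinct DAMSL act tags')
    # e.g. the integer index assigned to the statement tag 'sd'
    print('index of sd:', tag_indices.get('sd'))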
Example No. 2
def load_swda_corpus_data(swda_directory):
    print('Loading SwDA Corpus...')
    corpus_reader = CorpusReader(swda_directory)

    talks = []
    talk_names = []
    tags_seen = {}
    tag_occurances = {}
    num_tags_seen = 0
    X=[]
    Y=[]
    for transcript in corpus_reader.iter_transcripts(False):
        name = 'sw' + str(transcript.conversation_no)
        talk_names.append(name)
        conversation_content = []
        conversation_tags = []
        for utterance in transcript.utterances:
            conversation_content.append( utterance.text_words(True) )
            tag = utterance.damsl_act_tag()
            conversation_tags.append( tag )
            if tag not in tags_seen:
                tags_seen[tag] = num_tags_seen
                num_tags_seen += 1
                tag_occurances[tag] = 1
            else:
                tag_occurances[tag] += 1

        talks.append( (conversation_content, conversation_tags) )
        X.append(conversation_content)
        Y.append(conversation_tags)

    print('\nFound ' + str(len(tags_seen)) + ' different utterance tags.\n')

    for talk in talks:
        conversation_tags = talk[1]
        for i in range(len(conversation_tags)):
            conversation_tags[i] = tags_seen[ conversation_tags[i] ]

    print('Loaded SwDA Corpus.')
    return X,Y,talks, talk_names, tags_seen, tag_occurances
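The same kind of driver works for this variant (again a sketch with an assumed corpus path); the difference is that X and Y come back as parallel lists and tags_seen doubles as the tag-to-index map:

# Usage sketch -- illustrative only; the corpus path is an assumption.
X, Y, talks, talk_names, tags_seen, tag_occurances = load_swda_corpus_data('swda/swda')
print(len(X), 'conversations,', len(tags_seen), 'tags')
print('first conversation:', len(X[0]), 'utterances, first tag index', Y[0][0])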
Example No. 3
class SwitchboardConverter:
    def __init__(self, data_dir, tokenizer, word2id, task='', seed=42):
        self.corpus = CorpusReader(data_dir)
        self.data_dir = data_dir
        self.tokenizer = tokenizer
        self.word2id = word2id
        self.task = task

        # self.utt_num = 0
        # for utt in self.corpus.iter_utterances():
            # self.utt_num += 1

        self.trans_num = 1155
        self.deleted_utterances = 0
        self.deleted_tokens = 0
        self.in_tokens = 0
        # for trans in self.corpus.iter_transcripts():
            # self.trans_num += 1

        self.da2num = switchboard_da_mapping()

        self.stopwords = get_stopwords(data_dir, word2id)

        # CAUTION: use the same seed for every task so that the train/val/test splits are identical!
        train_ixs, val_ixs = train_test_split(range(self.trans_num), shuffle=True, train_size=0.8, random_state=seed)
        val_ixs, test_ixs = train_test_split(val_ixs, shuffle=True, train_size=0.5, random_state=seed)
        self.train_ixs, self.val_ixs, self.test_ixs = train_ixs, val_ixs, test_ixs

        self.utt_da_pairs = []
        prev_da = "%"
        for i, utt in enumerate(self.corpus.iter_utterances()):
            sentence = self.clean_utt(utt.text)
            if not sentence: continue
            
            self.in_tokens += len(sentence)
            sentence = self.word2id(sentence)
            if len(sentence) == 0:
                continue

            if not self.utt_acceptable(sentence):
                continue

            act = utt.damsl_act_tag()
            if act is None: act = "%"
            if act == "+": act = prev_da

            _, swda_name = os.path.split(utt.swda_filename)
            swda_name = swda_name[:-4] if swda_name.endswith('.csv') else swda_name

            ix = utt.utterance_index

            self.utt_da_pairs.append((sentence, act, swda_name, ix))

    def clean_utt(self, utterance):
        utterance, cnt = re.subn(r"([+/\}\[\],\-\(\)#]|\{\w)", "", utterance) ; self.deleted_tokens += cnt
        utterance, cnt = re.subn(r"<+.*>+", "", utterance) ; self.deleted_tokens += cnt
        utterance, cnt = re.subn(r"\*\w+", "", utterance) ; self.deleted_tokens += cnt
        utterance, cnt = re.subn(r">[\s\w'?]+$", "", utterance) ; self.deleted_tokens += cnt
        utterance, cnt = re.subn(r"\*.+$", "", utterance) ; self.deleted_tokens += cnt
        utterance, cnt = re.subn(r"\^\w+$", "", utterance) ; self.deleted_tokens += cnt
        utterance, cnt = re.subn(r"^uh+$", "", utterance) ; self.deleted_tokens += cnt
        utterance, cnt = re.subn(r"(uh+)", "", utterance) ; self.deleted_tokens += cnt
        
        ml = re.search(r"(^\s*\.\s*$)|>", utterance)
        if ml:
            self.deleted_utterances += 1
            return None
        utterance = [w.lower() for w in utterance.split(" ") if len(w) > 0 and not re.search("[Uu][Hh]+", w)]

        return utterance

    def utt_acceptable(self, utt):
        # check whether an utterance is acceptable for perturbation
        if not utt: return False

        stop_cnt = 0
        for w in utt:
            if w in self.stopwords:
                stop_cnt += 1

        # acceptable iff at least 4 non-stopword tokens remain
        return len(utt) - stop_cnt >= 4

    def draw_rand_sent(self):
        r = random.randint(0, len(self.utt_da_pairs)-1)
        return self.utt_da_pairs[r]

    def create_vocab(self):
        print("Creating Vocab file for Switchboard")

        cnt = Counter()
        for utt in self.corpus.iter_utterances():
            sentence = re.sub(r"([+/\}\[\]]|\{\w)", "",
                            utt.text)
            sentence = self.tokenizer(sentence)
            for w in sentence:
                cnt[w] += 1

        itos_file = os.path.join(self.data_dir, "itos.txt")
        with open(itos_file, "w") as itosf:
            for (word, _) in cnt.most_common(25000):
                itosf.write("{}\n".format(word))


    # getKeysByValue(d, value) is a project helper returning all keys of d that map to value
    def swda_permute(self, sents, amount, speaker_ixs):
        if amount == 0:
            return []

        permutations = [list(range(len(sents)))]
        segment_permutations = []
        amount = min(amount, factorial(len(sents))-1)
        segm_ixs = self.speaker_segment_ixs(speaker_ixs)
        segments = list(set(segm_ixs.values()))

        for i in range(amount):
            while True:
                permutation = []
                segm_perm = np.random.permutation(len(segments))
                segment_permutations.append(segm_perm)
                for segm_ix in segm_perm:
                    utt_ixs = sorted(getKeysByValue(segm_ixs, segm_ix))
                    permutation = permutation + utt_ixs

                if permutation not in permutations:
                    break

            permutations.append(permutation)
        # drop the first entry: it is the original order, kept only so it cannot be re-generated
        return permutations[1:], segment_permutations

    def speaker_segment_ixs(self, speaker_ixs):
        i = 0
        segment_indices = dict()
        prev_speaker = speaker_ixs[0]
        for j,speaker in enumerate(speaker_ixs):
            if speaker != prev_speaker:
                prev_speaker = speaker
                i += 1
            segment_indices[j] = i
        return segment_indices

    def swda_half_perturb(self, amount, speaker_ixs):
        segm_ixs = self.speaker_segment_ixs(speaker_ixs)
        segments = list(set(segm_ixs.values()))
        segment_permutations = []
        permutations = [list(segm_ixs.keys())]
        for _ in range(amount):
            speaker = random.randint(0,1) # choose one of the speakers
            speaker_to_perm = list(filter(lambda x: (x-speaker) % 2 == 0, segments))
            speaker_orig = list(filter(lambda x: (x-speaker) % 2 != 0, segments))
            #TODO: rename either speaker_ix or speaker_ixs, they are something different, but the names are too close
            if len(speaker_to_perm) < 2:
                return []

            while True:
                permuted_speaker_ix = np.random.permutation(speaker_to_perm).tolist()

                new_segments = [None]*(len(speaker_orig)+len(permuted_speaker_ix))
                if speaker == 0 : 
                    new_segments[::2] = permuted_speaker_ix
                    new_segments[1::2] = speaker_orig
                else:
                    new_segments[1::2] = permuted_speaker_ix
                    new_segments[::2] = speaker_orig
                segment_permutations.append(new_segments)

                permutation = []
                for segm_ix in new_segments:
                    utt_ixs = sorted(getKeysByValue(segm_ixs, segm_ix))
                    permutation = permutation + utt_ixs

                if permutation not in permutations:
                    permutations.append(permutation)
                    break

        return permutations, segment_permutations

    def swda_utterance_insertion(self, speaker_ixs, amounts):
        segment_ixs = self.speaker_segment_ixs(speaker_ixs)
        segments = list(set(segment_ixs.values()))
        segment_permutations = []
        permutations = []

        i = 0
        for _ in range(amounts):
            while True: # actually: do ... while permutation not in permutations
                i_from = random.randint(0, len(segments)-1)
                i_to = random.randint(0, len(segments)-2)
                segm_perm = deepcopy(segments)
                rem_elem = segments[i_from]
                segm_perm = segm_perm[0:i_from] + segm_perm[i_from+1:]
                segm_perm = segm_perm[0:i_to] + [rem_elem] + segm_perm[i_to:]

                permutation = []
                for segm_ix in segm_perm:
                    utt_ixs = sorted(getKeysByValue(segment_ixs, segm_ix))
                    permutation = permutation + utt_ixs

                if permutation not in permutations:
                    permutations.append(permutation)
                    segment_permutations.append(segm_perm)
                    break

        return permutations, segment_permutations

    def swda_utterance_sampling(self, utterances, acts, speaker_ixs, amount):
        segm_ixs = self.speaker_segment_ixs(speaker_ixs)
        segments = list(set(segm_ixs.values()))

        permutations = []

        for i in range(amount):
            (sentence, act, swda_name, ix) = self.draw_rand_sent()
            # insert_ix = random.choice(range(len(utterances)))

            while True:
                insert_ix = np.random.choice(range(len(utterances)))
                utt = utterances[insert_ix]
                act_orig = acts[insert_ix]
                if self.utt_acceptable(utt) and act != act_orig:
                    break

            permutations.append((sentence, act, swda_name, ix, insert_ix))

        return permutations

    def convert_dset(self, amounts):
        # create distinct train/validation/test files. they'll correspond to the created
        # splits from the constructor
        train_output_file = os.path.join(self.data_dir, 'train', 'coherency_dset_{}.txt'.format(self.task))
        val_output_file = os.path.join(self.data_dir, 'validation', 'coherency_dset_{}.txt'.format(self.task))
        test_output_file = os.path.join(self.data_dir, 'test', 'coherency_dset_{}.txt'.format(self.task))
        if not os.path.exists(os.path.join(self.data_dir, 'train')):
            os.makedirs(os.path.join(self.data_dir, 'train'))
        if not os.path.exists(os.path.join(self.data_dir, 'validation')):
            os.makedirs(os.path.join(self.data_dir, 'validation'))
        if not os.path.exists(os.path.join(self.data_dir, 'test')):
            os.makedirs(os.path.join(self.data_dir, 'test'))

        trainfile = open(train_output_file, 'w')
        valfile = open(val_output_file, 'w')
        testfile = open(test_output_file, 'w')

        shuffled_path = os.path.join(self.data_dir, "shuffled_{}".format(self.task))
        if not os.path.isdir(shuffled_path):
            os.mkdir(shuffled_path)

        for i,trans in enumerate(tqdm(self.corpus.iter_transcripts(display_progress=False), total=1155)):
            utterances = []
            acts = []
            speaker_ixs = []
            prev_act = "%"
            for utt in trans.utterances:
                sentence = self.clean_utt(utt.text)
                sentence = self.word2id(sentence)
                # print(sentence, " ## DAs: ", utt.act_tag)
                utterances.append(sentence)
                act = utt.damsl_act_tag()
                if act is None: act = "%"
                if act == "+": act = prev_act
                acts.append(self.da2num[act])
                prev_act = act
                if "A" in utt.caller:
                    speaker_ixs.append(0)
                else:
                    speaker_ixs.append(1)

            if self.task == 'up':
                permuted_ixs , segment_perms = self.swda_permute(utterances, amounts, speaker_ixs)
            elif self.task == 'us':
                permuted_ixs = self.swda_utterance_sampling(utterances, acts, speaker_ixs, amounts)
            elif self.task == 'hup':
                permuted_ixs , segment_perms = self.swda_half_perturb(amounts, speaker_ixs)
            elif self.task == 'ui':
                permuted_ixs, segment_perms = self.swda_utterance_insertion(speaker_ixs, amounts)

            swda_fname = os.path.split(trans.swda_filename)[1]
            shuffle_file = os.path.join(shuffled_path, swda_fname) # [:-4]
            with open(shuffle_file, "w") as f:
                #TODO: analogous to DD, write switchboard name into the file
                csv_writer = csv.writer(f)
                if self.task == 'us':
                    for perm in permuted_ixs:
                        (utt, da, name, ix, insert_ix) = perm
                        row = [name, ix,insert_ix]
                        csv_writer.writerow(row)
                else:
                    for perm in segment_perms:
                        csv_writer.writerow(perm)

            if self.task == 'us':
                for p in permuted_ixs:
                    a = " ".join([str(x) for x in acts])
                    u = str(utterances)
                    # (sentence, act, swda_name, ix, insert_ix)
                    insert_sent, insert_da, name, ix, insert_ix = p
                    insert_da = self.da2num[insert_da]
                    p_a = deepcopy(acts)
                    p_a[insert_ix] = insert_da
                    pa = " ".join([str(x) for x in p_a])
                    p_u = deepcopy(utterances)
                    p_u[insert_ix] = insert_sent

                    if i in self.train_ixs:
                        trainfile.write("{}|{}|{}|{}|{}\n".format("0",a,u,pa,p_u))
                        trainfile.write("{}|{}|{}|{}|{}\n".format("1",pa,p_u,a,u))
                    if i in self.val_ixs:
                        valfile.write("{}|{}|{}|{}|{}\n".format("0",a,u,pa,p_u))
                        valfile.write("{}|{}|{}|{}|{}\n".format("1",pa,p_u,a,u))
                    if i in self.test_ixs:
                        testfile.write("{}|{}|{}|{}|{}\n".format("0",a,u,pa,p_u))
                        testfile.write("{}|{}|{}|{}|{}\n".format("1",pa,p_u,a,u))

            else:
                for p in permuted_ixs:
                    a = " ".join([str(x) for x in acts])
                    u = str(utterances)
                    pa = [acts[i] for i in p]
                    p_a = " ".join([str(x) for x in pa])
                    pu = [utterances[i] for i in p]
                    p_u = str(pu)

                    if i in self.train_ixs:
                        trainfile.write("{}|{}|{}|{}|{}\n".format("0",a,u,p_a,p_u))
                        trainfile.write("{}|{}|{}|{}|{}\n".format("1",p_a,p_u,a,u))
                    if i in self.val_ixs:
                        valfile.write("{}|{}|{}|{}|{}\n".format("0",a,u,p_a,p_u))
                        valfile.write("{}|{}|{}|{}|{}\n".format("1",p_a,p_u,a,u))
                    if i in self.test_ixs:
                        testfile.write("{}|{}|{}|{}|{}\n".format("0",a,u,p_a,p_u))
                        testfile.write("{}|{}|{}|{}|{}\n".format("1",p_a,p_u,a,u))
Example No. 4
# similarity_processor = BertSimilarity()

QUESTIONS = ["qy", "qw", "qo", "qr"]


def get_similarity(sentence1, sentence2):
    print(sentence1, sentence2)
    return 1
    # similarity_processor.get_similarity(sentence1, sentence2)


def is_question(utterance):
    return utterance.damsl_act_tag() in QUESTIONS


if __name__ == '__main__':
    data_dir = "swda/swda"
    scan_range = 5
    cr = CorpusReader("swda1/swda")
    for dialog in cr.iter_transcripts(display_progress=True):
        for index, utterance in enumerate(dialog.utterances):
            if index < scan_range or index >= len(
                    dialog.utterances) - scan_range:
                continue
            if is_question(utterance):
                for i in range(-scan_range, scan_range):
                    print(
                        get_similarity(utterance.text,
                                       dialog.utterances[index + i].text))
    # pyplot.bar(range(scan_range*2), aggregate)
Example No. 5
    '+': 'sd',
    'fo_o_fw_"_by_bc': 'sd',
    'oo_co_cc': 'sd',
    'arp_nd': 'no',
    'aap_am': 'sd'
}

# assuming SWDA corpus installed in path-to-project/swda
# url of repo is https://github.com/cgpotts/swda
# preprocessor script for this model is at https://github.com/miyamotost/swda
corpus = CorpusReader('swda/swda')

with open('dataset/swda_datset_training.txt', mode='a') as f1, \
        open('dataset/swda_datset_test.txt', mode='a') as f2:
    for i, trans in enumerate(corpus.iter_transcripts(display_progress=False)):
        speakerids = [pad, pad, pad, pad]
        utts = [pad, pad, pad, pad]
        labels = [pad, pad, pad, pad]
        print('iter: {}'.format(i + 1))

        #
        # speakerid   : utt.caller_no
        # main_topics : trans.topic_description -- handled as "PAD" for now (i.e. ignored)
        # pos         : utt.act_tag -- handled as "PAD" for now (i.e. ignored)
        # utt         : utt.text
        # label       : utt.act_tag, utt.damsl_act_tag() -- labels the model does not use are mapped to ones it does
        #

        for utt in trans.utterances:
            speakerids.append(str(utt.caller_no))
Example No. 6
            pos = pos_list[k][1]
            add_token(language, sentence, word, lemma, pos, i, j, k + 1)

    tree = etree.ElementTree(text)
    tree.write(out_file,
               pretty_print=True,
               xml_declaration=True,
               encoding='utf-8')


def add_token(language, sentence, token, lemma, pos, i, j, k):
    """
    Converts a CoNLL-U token to an OPUS-XML 'w'-tag.
    """
    word = etree.SubElement(sentence, 'w')
    word.text = token
    word.set('id', 'w{}.{}.{}'.format(i, j, k))
    word.set('lem', lemma)
    word.set('tree', pos)


if __name__ == '__main__':
    corpus = CorpusReader('swda/swda')
    for transcript in corpus.iter_transcripts():
        out_file = os.path.splitext(os.path.basename(
            transcript.swda_filename))[0]
        out_path = 'swda-opus'
        os.makedirs(out_path, exist_ok=True)
        process_single('en', transcript,
                       os.path.join(out_path, out_file + '.xml'))
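For reference, a sketch of the 'w'-tag that add_token produces (the element and attribute names come from the code above; the bare sentence wrapper and the example token are assumptions):

# Illustrative only: build one sentence element and inspect the serialized output.
from lxml import etree

sentence = etree.Element('s')   # assumed parent element
add_token('en', sentence, 'Okay', 'okay', 'UH', 1, 1, 1)
print(etree.tostring(sentence, pretty_print=True).decode('utf-8'))
# expected shape: <s><w id="w1.1.1" lem="okay" tree="UH">Okay</w></s>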
Example No. 7
def prep_swda():
    """
    Put the conversations into a json format that torchtext can read easily.
    Each "example" is a conversation comprised of a list of utterances 
    and a list of dialogue act tags (each the same length)
    """

    log.info("Loading SWDA corpus.")
    if not os.path.isdir(SWDA_CORPUS_DIR):
        with zipfile.ZipFile("swda/swda.zip") as zip_ref:
            zip_ref.extractall('data')
    corpus = CorpusReader(SWDA_CORPUS_DIR)
    corpus = {t.conversation_no: t for t in corpus.iter_transcripts()}

    bert_vocab_file = BERT_VOCAB_FILE.format(BERT_MODEL)
    if not os.path.isfile(bert_vocab_file):
        log.info("Customizing BERT vocab.")
        customize_bert_vocab()
    log.info("Loading BERT vocab/tokenizer.")
    bert_tokenizer = BertTokenizer.from_pretrained(bert_vocab_file, 
            never_split = BERT_RESERVED_TOKENS + BERT_CUSTOM_TOKENS)

    log.info("Getting splits.")
    splits_file = SWDA_SPLITS.format('splits')
    if os.path.isfile(splits_file): # use existing SWDA splits (for reproducibility purposes)
        with open(splits_file) as f:
            splits = json.load(f)
    else: # save the splits file
        splits = gen_splits(list(corpus.keys()))
        with open(splits_file, 'w') as f:
            json.dump(splits, f)

    def words_to_ints(ws):
        maxvalue = max(vocab.values())
        for w in ws:
            if w not in vocab:
                maxvalue += 1
                vocab[w] = maxvalue
        xs = [vocab[x] for x in ws]
        return xs

    def tag_to_int(tag):
        maxvalue = max(tag_vocab.values()) if tag_vocab else -1
        if tag not in tag_vocab:
            maxvalue += 1
            tag_vocab[tag] = maxvalue
        return tag_vocab[tag] 

    def extract_example(transcript):
        """ Gets the parts we need from the SWDA utterance object """ 
        tags, tags_ints, utts, utts_ints, utts_ints_bert, utts_ints_nl, utts_ints_bert_nl = [], [], [], [], [], [], []
        for utt in transcript.utterances:
            # Regex tokenization
            words = "[SPKR_{}] ".format(utt.caller) + tokenize(utt.text.lower())
            words_nl = remove_laughters(remove_disfluencies(words))
            utts.append(words)
            utts_ints.append(words_to_ints(words.split()))
            utts_ints_nl.append(words_to_ints(words_nl.split()))
            # BERT wordpiece tokenization
            bert_text = "[CLS] [SPKR_{}] ".format(utt.caller) + utt.text
            bert_tokens = bert_tokenizer.tokenize(bert_text) # list of strings
            utts_ints_bert.append(bert_tokenizer.convert_tokens_to_ids(bert_tokens))
            bert_text_nl = remove_laughters(remove_disfluencies(bert_text))
            bert_tokens_nl = bert_tokenizer.tokenize(bert_text_nl)
            utts_ints_bert_nl.append(bert_tokenizer.convert_tokens_to_ids(bert_tokens_nl))
            # dialogue act tags
            tag = damsl_tag_cluster(utt.act_tag)
            tags.append(tag)
            tags_ints.append(tag_to_int(tag))
        return {'id': transcript.conversation_no, 'utts': utts, 'utts_ints': utts_ints, 
                'utts_ints_bert': utts_ints_bert, 'tags': tags, 'tags_ints': tags_ints,
                'utts_ints_bert_nl': utts_ints_bert_nl, 'utts_ints_nl': utts_ints_nl}

    log.info("Extracting data and saving splits.")
    for split in splits:
        data = []
        for ex_id in tqdm(splits[split], desc=split):
            data.append(extract_example(corpus[ex_id]))
        with open(SWDA_SPLITS.format(split), 'w') as f:
            json.dump(data, f)
    log.info("Vocab size: {}". format(len(vocab)))
    with open(SWDA_SPLITS.format("vocab"), 'w') as f:
        json.dump(vocab, f)
    log.info("Tag vocab size: {}". format(len(tag_vocab)))
    with open(SWDA_SPLITS.format("tag_vocab"), 'w') as f:
        json.dump(tag_vocab, f)
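A read-back sketch for the split files written above (the SWDA_SPLITS template below is a hypothetical value, shown only to mirror how the code above formats its paths):

# Illustrative only; the path template and split name are assumptions.
import json

SWDA_SPLITS = 'data/swda_{}.json'   # hypothetical template matching the usage above

with open(SWDA_SPLITS.format('train')) as f:
    train = json.load(f)
print(len(train), 'conversations in the training split')
first = train[0]
print(first['id'], len(first['utts']), 'utterances,', len(set(first['tags'])), 'distinct tags')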