Example #1
    # path to the mappings folder we want to write the mapping files to;
    # this can be temp and then removed.
    # mapping_dir = os.path.dirname(os.path.realpath(__file__)) +\
    #                "/../data/raw_data/swbd_swda2MS_mapping_temp"
    # make that dir if it doesn't exist
    # if not os.path.isdir(mapping_dir):
    #     os.mkdir(mapping_dir)

    ranges = sorted([line.strip("\n") for line in open(range_file)])

    print(len(ranges), "files to process")

    dialogue_speakers = []
    # for disf_file in disfluency_files:
    IDs, mappings, utts, pos_tags, labels = \
        load_data_from_disfluency_corpus_file(disf_file)
    # print(labels)
    dialogue_speakers.extend(sort_into_dialogue_speakers(
        IDs, mappings, utts, pos_tags, labels,
        convert_to_dnn_tags=False))
    print(len(dialogue_speakers), "dialogue speakers")
    # The main loop: it has every word in both formats and needs to map
    # from the MS file timings to the SWDA ones.
    # Given that the original SWDA transcripts are IN the MS-aligned files,
    # it is safer to map to them, then use the delete/insert/sub operations
    # in the files to get the pointer. So the mapping is
    # MSnew -> MSSWDAold -> SWDAold, where the last mapping is done through
    # minimum edit distance string alignment (sketched after this example).
    tests = ["2549B"]
    print "Creating word timing aligned corpus..."
    for dialogue_triple in sorted(dialogue_speakers, key=lambda x: x[0]):
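
The loop body is elided in the snippet above; its final step, per the comments, is minimum edit distance string alignment. Below is a minimal, self-contained sketch of that technique; the function name align_min_edit and the operation labels are illustrative, not taken from the original code:

def align_min_edit(source, target):
    """Align two token lists; return (op, source_index, target_index) tuples."""
    n, m = len(source), len(target)
    # cost[i][j] = minimum edits turning source[:i] into target[:j]
    cost = [[0] * (m + 1) for _ in range(n + 1)]
    for i in range(1, n + 1):
        cost[i][0] = i
    for j in range(1, m + 1):
        cost[0][j] = j
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            sub = 0 if source[i - 1] == target[j - 1] else 1
            cost[i][j] = min(cost[i - 1][j] + 1,        # delete
                             cost[i][j - 1] + 1,        # insert
                             cost[i - 1][j - 1] + sub)  # match/substitute
    # walk back from the bottom-right corner to recover the operations
    ops, i, j = [], n, m
    while i > 0 or j > 0:
        sub = 0 if (i > 0 and j > 0
                    and source[i - 1] == target[j - 1]) else 1
        if i > 0 and j > 0 and cost[i][j] == cost[i - 1][j - 1] + sub:
            ops.append(("match" if sub == 0 else "sub", i - 1, j - 1))
            i, j = i - 1, j - 1
        elif i > 0 and cost[i][j] == cost[i - 1][j] + 1:
            ops.append(("delete", i - 1, None))
            i -= 1
        else:
            ops.append(("insert", None, j - 1))
            j -= 1
    return list(reversed(ops))

# e.g. align_min_edit("a b c".split(), "a x b c".split()) returns
# [('match', 0, 0), ('insert', None, 1), ('match', 1, 2), ('match', 2, 3)]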
Example #2
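    # NOTE: this snippet assumes an argparse parser built earlier in the
    # script. A plausible sketch, inferred from the attribute names used
    # on args below (the real flag names in the source may differ):
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--word_rep_file")
    parser.add_argument("--pos_rep_file")
    parser.add_argument("--label_rep_file")
    parser.add_argument("--corpus_file")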
    args = parser.parse_args()

    word_dict = load_word_rep(args.word_rep_file)
    pos_dict = load_word_rep(args.pos_rep_file)
    label_dict = load_tags(args.label_rep_file)

    print('tags', args.label_rep_file)
    use_timing_data = False
    if "timings" in args.corpus_file:
        dialogues = load_data_from_corpus_file(args.corpus_file)
        use_timing_data = True
    else:
        print "no timings"
        IDs, timings, seq, pos_seq, targets = \
            load_data_from_disfluency_corpus_file(
                args.corpus_file,
                convert_to_dnn_format=True)
        raw_dialogues = sort_into_dialogue_speakers(
            IDs,
            timings,
            seq,
            pos_seq,
            targets,
            add_uttseg="uttseg" in args.label_rep_file,
            add_dialogue_acts="dact" in args.label_rep_file)
        dialogues = []
        for conv_no, indices, lex_data, pos_data, labels in raw_dialogues:
            # without word timing data, word indices stand in for frames
            frames = indices
            dialogues.append(
                (conv_no, (frames, lex_data, pos_data, indices, labels)))
Example #3

heldout_file = open(asr_dir + "SWDisfHeldout_increco.text", "w")
test_file = open(asr_dir + "SWDisfTest_increco.text", "w")
leftout_ranges = [line.strip("\n") for line in open(asr_dir + "leftout_asr")]

# split the big disfluency marked-up files into individual file tuples;
# it is possible to do the matching at the utterance level, as the two
# should have consistent mark-up
disf_dir = "../data/disfluency_detection/switchboard"
disfluency_files = [
    disf_dir + "/swbd_heldout_partial_data.csv",
    disf_dir + "/swbd_test_partial_data.csv"
]
dialogue_speakers = []
for key, disf_file in zip(["heldout", "test"], disfluency_files):
    IDs, mappings, utts, pos_tags, labels = load_data_from_disfluency_corpus_file(
        disf_file)
    dialogue_speakers.extend(
        sort_into_dialogue_speakers(IDs, mappings, utts, pos_tags, labels))
word_pos_data = {}  # maps each dialogue/speaker file name to its data
for data in dialogue_speakers:
    # each tuple is (dialogue_id, mappings, utterances, pos_tags, labels)
    dialogue, mappings, utts, pos_tags, labels = data
    word_pos_data[dialogue] = (mappings, utts, pos_tags, labels)

for key in sorted(word_pos_data.keys()):
    print(key)
    print(word_pos_data[key][1][:100])
    break
#quit()

average_wer = []
leftout = []