Example #1
0
    print 'tags', args.label_rep_file
    use_timing_data = False
    if "timings" in args.corpus_file:
        dialogues = load_data_from_corpus_file(args.corpus_file)
        use_timing_data = True
    else:
        print "no timings"
        IDs, timings, seq, pos_seq, targets = \
            load_data_from_disfluency_corpus_file(
                                                args.corpus_file,
                                                convert_to_dnn_format=True)
        raw_dialogues = sort_into_dialogue_speakers(IDs,
                                                    timings,
                                                    seq,
                                                    pos_seq,
                                                    targets,
                                                    add_uttseg="uttseg"
                                                    in args.label_rep_file,
                                                    add_dialogue_acts="dact"
                                                    in args.label_rep_file)
        dialogues = []
        for conv_no, indices, lex_data, pos_data, labels in raw_dialogues:
            frames = indices
            dialogues.append(
                (conv_no, (frames, lex_data, pos_data, indices, labels)))

    save_feature_matrices(args.matrices_folder,
                          dialogues,
                          use_timing_data=use_timing_data,
                          word_2_idx_dict=word_dict,
                          pos_2_idx_dict=pos_dict,
Example #2
0
        #                "/../data/raw_data/swbd_swda2MS_mapping_temp"
        #make that dir if it doesn't exist
        #if not os.path.isdir(mapping_dir):
        #    os.mkdir(mapping_dir)

    ranges = sorted([line.strip("\n") for line in open(range_file)])

    print len(ranges), "files to process"

    dialogue_speakers = []
    #for disf_file in disfluency_files:
    IDs, mappings, utts, pos_tags, labels = \
            load_data_from_disfluency_corpus_file(disf_file)
    # print labels
    dialogue_speakers.extend(sort_into_dialogue_speakers(IDs, mappings, utts,
                                                         pos_tags, labels,
                                                 convert_to_dnn_tags=False))
    print len(dialogue_speakers), "dialogue speakers"
    # The main loop: every word exists in both formats; we need to map from
    # the MS file timings to the SWDA ones.
    # Given that the original SWDA transcripts are IN the MS-aligned files,
    # it is safer to map to them first, then use the delete/insert/substitute
    # operations in the files to get the pointer.
    # So the mapping is MSnew -> MSSWDAold -> SWDAold, where the last mapping
    # is done through minimum-edit-distance string alignment.
    tests = ["2549B"]
    print "Creating word timing aligned corpus..."
    for dialogue_triple in sorted(dialogue_speakers, key=lambda x: x[0]):
        dialogue_speakerID, SWDAindices, SWDAwords, SWDApos, SWDAlabels = \
                                                        dialogue_triple
        origSWDAindices = deepcopy(SWDAindices)
Example #3
0
    joint_tag_rep_file = disf_tag_rep_file

    if "timings" in args.corpusFile:
        dialogues = load_data_from_corpus_file(corpus_file,
                                               convert_to_dnn_format=False)
        _, words, pos, _, labels = concat_all_data_all_speakers(
            dialogues,
            divide_into_utts=not args.uttSeg,
            convert_to_dnn_format=True)

    else:
        IDs, timings, seq, pos_seq, targets = \
                load_data_from_disfluency_corpus_file(
                                                corpus_file,
                                                convert_to_dnn_format=True)
        dialogues = sort_into_dialogue_speakers(IDs, timings, seq, pos_seq,
                                                targets)
        words = [x[2] for x in dialogues]
        pos = [x[3] for x in dialogues]
        labels = [x[4] for x in dialogues]

    create_tag_representations(disf_tag_rep_file,
                               labels,
                               representation="disf1",
                               tag_corpus_file=disf_tag_rep_file.replace(
                                   "tags", "tag_corpus"))
    # simple
    create_tag_representations(
        disf_tag_rep_file.replace("_tags", "_simple_tags"),
        labels,
        representation="disf1_simple",
        tag_corpus_file=disf_tag_rep_file.replace("_tags",
# Script section: prepare SWDA disfluency data for pairing with ASR output.
# NOTE(review): this is Python 2 code (print statements). `asr_dir`, `os`,
# `load_data_from_disfluency_corpus_file` and `sort_into_dialogue_speakers`
# are defined earlier in the file, outside this view. The file handle opened
# below is presumably closed later in the script — TODO confirm.
test_file = open(asr_dir + "SWDisfTest_increco.text", "w")
# Conversation IDs excluded from the ASR run, one per line.
leftout_ranges = [line.strip("\n") for line in open(asr_dir + "leftout_asr")]

#split the big disfluency marked-up files into individual file tuples;
#the matching can be done at the utterance level as the two formats
#should have consistent mark-up between them
disf_dir = "../data/disfluency_detection/switchboard"
disfluency_files = [
    disf_dir + "/swbd_heldout_partial_data.csv",
    disf_dir + "/swbd_test_partial_data.csv"
]
# Parse each corpus file and flatten all speakers into one list of 5-tuples
# (first element is the dialogue/speaker ID; see unpacking below).
dialogue_speakers = []
for key, disf_file in zip(["heldout", "test"], disfluency_files):
    IDs, mappings, utts, pos_tags, labels = load_data_from_disfluency_corpus_file(
        disf_file)
    dialogue_speakers.extend(
        sort_into_dialogue_speakers(IDs, mappings, utts, pos_tags, labels))
word_pos_data = {}  #map from the dialogue/speaker ID to its 4-tuple of data
for data in dialogue_speakers:
    dialogue, a, b, c, d = data
    word_pos_data[dialogue] = (a, b, c, d)

# Debug peek: print the first key (sorted order) and a sample of its second
# data field, then stop after one iteration.
for key in sorted(word_pos_data.keys()):
    print key
    print word_pos_data[key][1][:100]
    break
#quit()

# Accumulators for the per-file loop that follows: per-file word error rates,
# files skipped via leftout_ranges, and the current pair of files being built.
average_wer = []
leftout = []
pair = []  #pair of files
for filename in sorted(os.listdir(asr_dir)):