# Build per-dialogue feature matrices from a corpus file and save them to disk.
# Two input formats are supported, selected by the corpus filename:
#   * a "timings" corpus file, loaded whole via load_data_from_corpus_file;
#   * a plain disfluency corpus file, which is loaded, converted to DNN
#     label format, and regrouped into per-dialogue-speaker tuples.
# NOTE(review): this fragment is truncated — the save_feature_matrices(...)
# call continues beyond the visible text.
print 'tags', args.label_rep_file
use_timing_data = False
if "timings" in args.corpus_file:
    # Corpus already carries word timing information; load it directly.
    dialogues = load_data_from_corpus_file(args.corpus_file)
    use_timing_data = True
else:
    print "no timings"
    IDs, timings, seq, pos_seq, targets = \
        load_data_from_disfluency_corpus_file(
            args.corpus_file, convert_to_dnn_format=True)
    # Regroup the flat per-word arrays into one tuple per dialogue speaker.
    # Utterance-segmentation / dialogue-act tags are added only when the
    # label representation name asks for them.
    raw_dialogues = sort_into_dialogue_speakers(IDs, timings, seq, pos_seq,
                                                targets,
                                                add_uttseg="uttseg" in args.label_rep_file,
                                                add_dialogue_acts="dact" in args.label_rep_file)
    dialogues = []
    for conv_no, indices, lex_data, pos_data, labels in raw_dialogues:
        # No real frame timings here — the word indices stand in for frames
        # (presumably so downstream code sees a uniform tuple shape; verify).
        frames = indices
        dialogues.append(
            (conv_no, (frames, lex_data, pos_data, indices, labels)))
# Persist the matrices; word_dict / pos_dict map tokens to integer ids
# (defined earlier in the file, outside this fragment).
save_feature_matrices(args.matrices_folder, dialogues,
                      use_timing_data=use_timing_data,
                      word_2_idx_dict=word_dict,
                      pos_2_idx_dict=pos_dict,
# "/../data/raw_data/swbd_swda2MS_mapping_temp" #make that dir if it doesn't exist #if not os.path.isdir(mapping_dir): # os.mkdir(mapping_dir) ranges = sorted([line.strip("\n") for line in open(range_file)]) print len(ranges), "files to process" dialogue_speakers = [] #for disf_file in disfluency_files: IDs, mappings, utts, pos_tags, labels = \ load_data_from_disfluency_corpus_file(disf_file) # print labels dialogue_speakers.extend(sort_into_dialogue_speakers(IDs, mappings, utts, pos_tags, labels, convert_to_dnn_tags=False)) print len(dialogue_speakers), "dialogue speakers" #The main loop- has every word in both formats, needs to map from MS file # timings to the SWDA ones #Given the original SWDA transcripts are IN the MSaligned files, it's safer #to map to them, #then use the delete/insert/sub operations in the files to get the pointer. #So the mapping is MSnew -> MSSWDAold -> SWDAold, #where the last mapping is done through min. edit distance string alignment tests = ["2549B"] print "Creating word timing aligned corpus..." for dialogue_triple in sorted(dialogue_speakers, key=lambda x: x[0]): dialogue_speakerID, SWDAindices, SWDAwords, SWDApos, SWDAlabels = \ dialogue_triple origSWDAindices = deepcopy(SWDAindices)
# Derive disfluency tag-representation files ("disf1" and a simplified
# "disf1_simple" variant) from either a timed corpus file or a plain
# disfluency corpus file.
# NOTE(review): this fragment is truncated — the second
# create_tag_representations(...) call continues beyond the visible text.
# NOTE(review): args.corpusFile is tested but corpus_file is what gets
# loaded — presumably the same value bound earlier; confirm upstream.
joint_tag_rep_file = disf_tag_rep_file
if "timings" in args.corpusFile:
    dialogues = load_data_from_corpus_file(corpus_file,
                                           convert_to_dnn_format=False)
    # Flatten every speaker's data into single word/POS/label sequences;
    # utterance division is skipped when utterance segmentation is on.
    _, words, pos, _, labels = concat_all_data_all_speakers(
        dialogues, divide_into_utts=not args.uttSeg,
        convert_to_dnn_format=True)
else:
    IDs, timings, seq, pos_seq, targets = \
        load_data_from_disfluency_corpus_file(
            corpus_file, convert_to_dnn_format=True)
    dialogues = sort_into_dialogue_speakers(IDs, timings, seq, pos_seq,
                                            targets)
    # Per-speaker tuples: index 2 = words, 3 = POS tags, 4 = labels.
    words = [x[2] for x in dialogues]
    pos = [x[3] for x in dialogues]
    labels = [x[4] for x in dialogues]
# Full "disf1" tag set; the companion tag corpus file is named by
# substituting "tags" -> "tag_corpus" in the representation filename.
create_tag_representations(disf_tag_rep_file, labels,
                           representation="disf1",
                           tag_corpus_file=disf_tag_rep_file.replace(
                               "tags", "tag_corpus"))
# simple
create_tag_representations(
    disf_tag_rep_file.replace("_tags", "_simple_tags"), labels,
    representation="disf1_simple",
    tag_corpus_file=disf_tag_rep_file.replace("_tags",
# Pair Switchboard heldout/test disfluency transcripts with ASR
# (incremental-recognition) result files found in asr_dir, writing a
# combined "increco" test file and tracking WER / left-out dialogues.
# NOTE(review): this fragment is truncated — the body of the final
# os.listdir loop continues beyond the visible text.
test_file = open(asr_dir + "SWDisfTest_increco.text", "w")
# Dialogues previously excluded from ASR processing, one ID per line.
leftout_ranges = [line.strip("\n")
                  for line in open(asr_dir + "leftout_asr")]
# split the big disfluency marked-up files into individual file tuples
# it is possible to do the matching on the utterance level as they should
# have consistent mark-up between the two
disf_dir = "../data/disfluency_detection/switchboard"
disfluency_files = [
    disf_dir + "/swbd_heldout_partial_data.csv",
    disf_dir + "/swbd_test_partial_data.csv"
]
dialogue_speakers = []
for key, disf_file in zip(["heldout", "test"], disfluency_files):
    IDs, mappings, utts, pos_tags, labels = \
        load_data_from_disfluency_corpus_file(disf_file)
    dialogue_speakers.extend(
        sort_into_dialogue_speakers(IDs, mappings, utts, pos_tags, labels))
word_pos_data = {}  # map from the file name to the data
for data in dialogue_speakers:
    # Tuple layout: (dialogue ID, mappings, words, POS tags, labels) —
    # presumably; verify against sort_into_dialogue_speakers.
    dialogue, a, b, c, d = data
    word_pos_data[dialogue] = (a, b, c, d)
# Debug peek at the first dialogue's word data only (break after one).
for key in sorted(word_pos_data.keys()):
    print key
    print word_pos_data[key][1][:100]
    break
# quit()
average_wer = []
leftout = []
pair = []  # pair of files
for filename in sorted(os.listdir(asr_dir)):