def walk_tree(dir):
    """Collect region timing totals for every ``.cha`` file under ``dir``.

    Each file is parsed with ``pyclan.ClanFile``; its user comments are
    narrowed with ``extract_region_comments`` and grouped with
    ``group_comments`` into subregions, silences and skips.  Files that fail
    during grouping/summing are counted and their region comments are dumped
    to ``errors.txt`` in the current working directory.

    Args:
        dir: Root directory to walk recursively.

    Returns:
        A list of ``[filename, total_time, subregion_time, silence_time,
        skip_time]`` rows, one per successfully processed file.
    """
    results = []
    fail_count = 0
    with open('errors.txt', 'wb') as err_out:
        for root, dirs, files in os.walk(dir):
            for file in files:
                if not file.endswith(".cha"):
                    continue
                print("parsing file: {}".format(file))
                clan_file = pyclan.ClanFile(os.path.join(root, file))
                comments = clan_file.get_user_comments()
                regions = extract_region_comments(comments)
                try:
                    subregions, silences, skips = group_comments(regions)
                    # Every group item is indexed as (start, end).
                    subrg_time = sum(x[1] - x[0] for x in subregions)
                    silen_time = sum(x[1] - x[0] for x in silences)
                    skips_time = sum(x[1] - x[0] for x in skips)
                    results.append([file, clan_file.total_time, subrg_time,
                                    silen_time, skips_time])
                except Exception as e:
                    # The handler was written for exceptions carrying
                    # (message, regions).  Any other exception used to blow
                    # up here with a ValueError on unpacking, aborting the
                    # whole walk; fall back to the comments already in hand.
                    if len(e.args) == 2:
                        msg, failed_regions = e.args
                    else:
                        msg, failed_regions = str(e), regions
                    print(msg)
                    fail_count += 1
                    err_out.write("{}\n\n".format(file))
                    err_out.writelines([
                        "{} --- {}\n".format(comment.line.replace("\n", ""),
                                             comment.time_onset)
                        for comment in failed_regions
                    ])
                    err_out.write("\n\n\n")
    print("\n\n\nfailed file count: {}".format(fail_count))
    return results
def process_file(path):
    """Extract subregion onset/offset pairs from one CLAN file.

    Rows of ``[file prefix, subregion index, onset, offset]`` are appended to
    the module-level ``regions`` list; nothing is returned.

    Args:
        path: Path of the ``.cha`` file to parse.

    Raises:
        Exception: If the number of subregion comments is odd, a comment has
            zero or several matches of ``subr_regx``, or a subregion index
            does not have at least two comments.
    """
    cf = pc.ClanFile(path)
    pfx = cf.filename[:5]

    # All comments that mention "subregion", ordered by offset.  Building a
    # real list (rather than calling .sort() on filter()'s result) keeps this
    # working where filter() returns an iterator.
    subrs = sorted(
        [c for c in cf.get_user_comments() if "subregion" in c.line],
        key=lambda c: c.offset,
    )

    # Subregion lines should come in pairs.
    if len(subrs) % 2 != 0:
        raise Exception(
            "odd number of subregion comments ({}) in {}".format(len(subrs), path))

    offsets_by_index = {}
    for comment in subrs:
        matches = subr_regx.findall(comment.line)
        # Each subregion line should have exactly one regex match.
        if len(matches) > 1:
            raise Exception(
                "several subregion matches in comment: {!r}".format(comment.line))
        if not matches:
            raise Exception(
                "unparseable subregion comment: {!r}".format(comment.line))
        key = matches[0][0]
        # Comments arrive sorted by offset, so for each index the first
        # stored value is the onset and the second the offset.
        offsets_by_index.setdefault(key, []).append(comment.offset)

    rows = []
    for key, val in offsets_by_index.items():
        if len(val) < 2:
            raise Exception(
                "subregion {} has a single comment in {}".format(key, path))
        # filename prefix, subregion index, onset, offset
        rows.append([pfx, int(key), val[0], val[1]])
    regions.extend(rows)
def process_file(f):
    """Bump timestamps ending in 9 by one when the bumped form is known.

    For every timestamped line of the CLAN file at ``f.file``, an onset and/or
    offset whose last digit is "9" is incremented by one.  The line is
    rewritten only if the resulting ``onset_offset`` string is a member of
    ``f.times``.  When at least one line changed, the file is written to
    ``out_dir`` and the running global ``total_fixed`` is reported.

    Args:
        f: Object exposing ``file`` (path of the .cha file) and ``times``
            (container of acceptable timestamp strings).
    """
    global total_fixed
    cf = pc.ClanFile(f.file)
    announced = False

    for entry in cf.line_map:
        if not entry._has_timestamp:
            continue

        stamp = entry.timestamp().split("_")
        onset_hit = stamp[0][-1] == "9"
        if not (onset_hit or stamp[1][-1] == "9"):
            continue

        before = "_".join(stamp)
        if onset_hit:
            stamp[0] = str(int(stamp[0]) + 1)
        if stamp[1][-1] == "9":
            stamp[1] = str(int(stamp[1]) + 1)
        after = "_".join(stamp)

        if after not in f.times:
            continue

        # Print the file name once, before its first fix.
        if not announced:
            print("**{}**".format(os.path.basename(f.file)))
            announced = True
        entry.line = entry.line.replace(before, after)
        total_fixed += 1

    if announced:
        cf.write_to_cha(os.path.join(out_dir, cf.filename))
        print("\t\t\t\ttotal fixed so far: {}".format(total_fixed))
def sum_time(filepath):
    """Return the summed duration of all tier lines in a CLAN file.

    Args:
        filepath: Path of the ``.cha`` file to parse.

    Returns:
        The sum of ``time_offset - time_onset`` over every line whose
        ``is_tier_line`` flag is set; 0 when there is none.
    """
    parsed = pc.ClanFile(filepath)
    return sum(
        entry.time_offset - entry.time_onset
        for entry in parsed.line_map
        if entry.is_tier_line
    )
def end_time(filepath):
    """Return the ``time_offset`` of the last tier line of a CLAN file.

    Args:
        filepath: Path of the ``.cha`` file to parse.

    Returns:
        ``time_offset`` of the final line flagged ``is_tier_line``.  When the
        file has no tier line the function falls through and implicitly
        returns None.
    """
    parsed = pc.ClanFile(filepath)
    # Scan from the end so the first tier line met is the last one in the file.
    for entry in reversed(parsed.line_map):
        if entry.is_tier_line:
            return entry.time_offset
    # Reached only when no tier line exists.  A bare `print` emits a blank
    # line as a Python 2 statement and is a no-op expression under Python 3.
    print
def walk_tree(start_dir):
    """Record (and optionally copy) every ``.cha`` file with a CHI annotation.

    Walks ``start_dir`` recursively.  A file qualifies when any of its
    annotations has ``speaker == "CHI"``; its name is appended to the
    module-level ``list_of_files`` and, if the module-level flag
    ``copy_to_folder`` is truthy, the file is copied into ``output_dir``.
    After the walk, ``output_csv(list_of_files)`` is called.

    Args:
        start_dir: Root directory to search.
    """
    for root, dirs, files in os.walk(start_dir):
        for name in files:
            if not name.endswith(".cha"):
                continue
            source = os.path.join(root, name)
            annots = pc.ClanFile(source).annotations()
            has_chi = any(annot.speaker == "CHI" for annot in annots)
            if not has_chi:
                continue
            if copy_to_folder:
                shutil.copy(source, os.path.join(output_dir, name))
            list_of_files.append(name)
    output_csv(list_of_files)
def pull_regions(path):
    """List the region boundary markers found in a CLAN file's user comments.

    Comments are sorted in place by ``offset``.  Each comment is classified by
    the first keyword group it contains (subregion, extra, silence, skip,
    makeup — in that order); it then contributes a ``"<kind> starts"`` and/or
    ``"<kind> ends"`` entry depending on which boundary words it contains.

    Args:
        path: Path of the ``.cha`` file to parse.

    Returns:
        A ``(sequence, cf)`` tuple: ``sequence`` is a list of
        ``(label, offset)`` tuples in comment order and ``cf`` is the parsed
        ``ClanFile``.
    """
    # (keywords selecting the kind, label, opening word, closing word);
    # order matters — only the first matching kind is applied to a comment.
    rules = (
        (('subregion',), 'subregion', 'starts', 'ends'),
        (('extra',), 'extra', 'begin', 'end'),
        (('silence',), 'silence', 'start', 'end'),
        (('skip',), 'skip', 'begin', 'end'),
        (('makeup', 'make-up', 'make up'), 'makeup', 'begin', 'end'),
    )

    cf = pc.ClanFile(path)
    comments = cf.get_user_comments()
    comments.sort(key=lambda c: c.offset)

    sequence = []
    for comment in comments:
        text = comment.line
        for keywords, label, opener, closer in rules:
            if not any(word in text for word in keywords):
                continue
            # Both checks are independent: one comment may add two entries.
            if opener in text:
                sequence.append((label + ' starts', comment.offset))
            if closer in text:
                sequence.append((label + ' ends', comment.offset))
            break
    return sequence, cf
def _dump_pi_errors(path, begin, end):
    """Write all begin/end personal-info lines, ordered by onset, to a report.

    The report is ``errors/<first 5 chars of the file name>_pi_errors``.
    """
    # Make sure the report directory exists; otherwise the IOError raised by
    # open() would hide the validation error the caller is about to raise.
    if not os.path.isdir("errors"):
        os.makedirs("errors")
    report = os.path.join(
        "errors", "{}_pi_errors".format(os.path.basename(path)[:5]))
    ordered = sorted(begin + end, key=lambda x: x.onset)
    # NOTE(review): text written through a "wb" handle only works on
    # Python 2 — kept as in the original.
    with open(report, "wb") as out:
        for entry in ordered:
            out.write("{} - {}\n".format(entry.line.replace("\n", ""),
                                         entry.timestamp()))


def extract_pi_regions(path):
    """Return the personal-information regions marked in a CLAN file.

    Lines containing "begin personal" and "end personal" are paired up in file
    order (i-th begin with i-th end).

    Args:
        path: Path of the ``.cha`` file to parse.

    Returns:
        A list of ``(path, begin_onset, end_onset)`` tuples.

    Raises:
        Exception: If the begin and end counts differ, or some pair does not
            satisfy ``begin_onset < end_onset``.  In both cases the offending
            file's begin/end lines are first dumped under ``errors/``.
    """
    cf = pc.ClanFile(path)
    begin = [x for x in cf.line_map if "begin personal" in x.line]
    end = [x for x in cf.line_map if "end personal" in x.line]

    if len(begin) != len(end):
        _dump_pi_errors(path, begin, end)
        raise Exception("Begin and End comment count mismatch: {}".format(
            os.path.basename(path)))

    joined = [(path, b.onset, e.onset) for b, e in zip(begin, end)]
    for _, begin_onset, end_onset in joined:
        if not begin_onset < end_onset:
            _dump_pi_errors(path, begin, end)
            raise Exception("End comment precedes Begin: {}".format(
                os.path.basename(path)))
    return joined
import pandas as pd
import sys
import os
import pyclan as pc

if __name__ == "__main__":
    # Usage: <script> PROBLEMS_CSV CHA_DIR OUT_DIR
    problems = pd.read_csv(sys.argv[1])
    cha_dir = sys.argv[2]
    out_dir = sys.argv[3]

    # Index the .cha files by the first five characters of their name; a
    # later file with the same prefix replaces an earlier one.
    files = {
        name[:5]: os.path.join(cha_dir, name)
        for name in os.listdir(cha_dir)
        if name.endswith(".cha")
    }

    for pfx, annots in problems.groupby('SubjectNumber'):
        cf = pc.ClanFile(files[pfx])

        # Keep every user comment that mentions neither keyword.
        coms = []
        for com in cf.get_user_comments():
            if "subregion" in com.line or "silence" in com.line:
                continue
            coms.append(com)

        # One output file per subject, named after the group key.
        with open(os.path.join(out_dir, pfx), 'wb') as out:
            out.writelines(
                "{} --- {}\n".format(com.line.replace("\n", ""), com.onset)
                for com in coms
            )
def process_single_clan_file(path, output_folder="output/cha_structures"):
    """Check one CLAN file's region structure and compute its listen time.

    The sorted region boundaries, detected errors and subregion descriptions
    are written to ``<output_folder>/<file name>.txt``.  If no boundary is
    missing, the listen time is computed with ``total_listen_time`` and the
    processed region map is saved as CSV under ``<output_folder>/processed``.

    Args:
        path: Path of the ``.cha`` file.
        output_folder: Folder receiving the text report and the CSV.

    Returns:
        A ``(file_with_error_, listen_time)`` tuple.  ``file_with_error_`` is
        None or ``(basename, error_list)``; ``listen_time`` is None when the
        file could not be opened, has a "missing" error, or the computation
        failed — otherwise the dict returned by ``total_listen_time`` extended
        with ``filename``, ``subregions``, ``ranks`` and ``positions``.
    """
    basename = os.path.basename(path)
    output_path = Path(output_folder) / (Path(path).name + '.txt')
    # Delete the old report so a stale one never survives a failed run.
    output_path.unlink(missing_ok=True)

    file_with_error_, listen_time = None, None
    print("Checking {}".format(basename))

    # Parse the clan file
    try:
        clan_file = pyclan.ClanFile(path)
    except Exception:
        print(BColors.FAIL + "Error opening file: {}".format(path) +
              BColors.ENDC)
        print(sys.exc_info())
        return file_with_error_, listen_time

    # Extract the starts/ends of all regions plus subregion positions/ranks
    region_boundaries, subregions = pull_regions(clan_file=clan_file)

    # Sort by timestamp and - in case of collisions - by region rank
    region_boundaries = sort_list_of_region_boundaries(region_boundaries)

    # Check for errors
    error_list, region_map = sequence_missing_repetition_entry_alert(
        region_boundaries)
    if error_list:
        print(
            BColors.WARNING +
            "Finished {0} with errors! Listen time cannot be calculated due to missing starts or ends!\nCheck the {0}.txt file for errors!"
            .format(basename) + BColors.ENDC)
        file_with_error_ = (basename, error_list)

    # Write results to a text file.  The folder is created first so that a
    # fresh output location does not fail with FileNotFoundError.
    output_path.parent.mkdir(exist_ok=True, parents=True)
    with open(output_path, 'w') as f:
        # Region boundaries
        f.write('\n'.join([
            region_type_and_side + ' ' + str(timestamp)
            for region_type_and_side, timestamp in region_boundaries
        ]))
        f.write('\n' * 3)
        # List of errors
        f.write('\n'.join(error_list))
        # Subregion information
        f.write('\n')
        f.write('\n'.join(subregions))

    # A missing start or end makes the listen time meaningless, so stop here.
    if any('missing' in error for error in error_list):
        return file_with_error_, listen_time

    try:
        # Characters 3-4 of the file name equal to '06' or '07' select the
        # month67 mode of total_listen_time.
        month67 = basename[3:5] in ['06', '07']
        listen_time, processed_region_map = total_listen_time(
            clan_file, region_map, month67=month67)
    except Exception as e:
        # Previously swallowed without a trace; report it so a None listen
        # time can be told apart from the "missing boundary" case above.
        print(BColors.FAIL +
              "Error calculating listen time for {}: {!r}".format(basename, e) +
              BColors.ENDC)
        return file_with_error_, listen_time

    # Save processed region map for debugging
    processed_region_map_path = Path(output_folder) / 'processed' / Path(
        path).with_suffix('.csv').name
    processed_region_map_path.parent.mkdir(exist_ok=True, parents=True)
    region_map_to_df(processed_region_map).to_csv(processed_region_map_path,
                                                  index=False)

    listen_time['filename'] = basename

    # Each subregion is a string like 'Position: 4, Rank: 4'.
    positions = []
    ranks = []
    for subregion in subregions:
        position_string, rank_string = subregion.split(',')
        positions.append(position_string.split()[1])
        ranks.append(rank_string.split()[1])

    listen_time['subregions'] = subregions
    listen_time['ranks'] = ranks
    listen_time['positions'] = positions

    print("Finished {}".format(basename) + '\nTotal Listen Time: ' +
          BColors.OKGREEN + str(ms2hr(listen_time['total_listen_time'])) +
          BColors.ENDC)
    print(subregions)
    return file_with_error_, listen_time
def cha2eaf(inpf, outf):
    """Convert a given cha file to an eaf file.

    Args:
        inpf: Path of the input ``.cha`` file.
        outf: Path of the output ``.eaf`` file.  When it is the empty string,
            the input's base name with '.cha' replaced by '.eaf' is used.

    Lines after the header are handled by kind: tier lines and block
    delimiters become regular annotations; comment lines and xdb lines are
    added through ``add_ref_annotation``; the end header is skipped; anything
    else is printed as unprocessed.
    """
    eaf = setup_eaf()
    cha = pyclan.ClanFile(inpf)
    # NOTE: nothing is returned at present; a participant database might be
    # returned in the future.
    _process_header(cha, eaf)

    # Processing the rest of the file after the header.
    end = cha.get_header()[-1].index
    current_tier = ''

    # The plus one below starts the iteration right after the last header line.
    for line in cha.line_map[end + 1:]:
        if line.is_tier_line:
            # If the tier does not exist, add it first.
            if line.tier not in eaf.get_tier_names():
                eaf.add_tier(line.tier)
            current_tier = line.tier
        elif line.is_paus_block_delimiter or line.is_conv_block_delimiter:
            line.tier = BLOCK
            line.content = line.line.strip()
            # pympi does not like onset and offset being the same...
            if line.onset == 0 and line.offset == 0:
                line.onset, line.offset = 0, 1
            else:
                line.onset += 1
                line.offset -= 1
        elif line.is_clan_comment or line.is_user_comment or 'xcom' in line.line:
            if line.tier in eaf.get_tier_names():
                line.parent_tier = line.tier
            else:
                line.parent_tier = current_tier
            # Split on the first tab only: comment text that itself contains
            # a tab used to make the two-name unpacking raise ValueError.
            line.tier, line.content = line.line.strip().split('\t', 1)
            add_ref_annotation(line, eaf)
            continue
        elif line.xdb_line:
            line.tier = XDB
            # Usually XDB lines do get parent_tier set from pyclan, but if
            # there are other lines in between, that does not appear to be
            # the case. So we are setting it here.
            if not line.parent_tier:
                line.parent_tier = current_tier
            add_ref_annotation(line, eaf)
            continue
        elif line.is_end_header:
            # This is just the end header. It is not necessary to add it.
            continue
        else:
            print(line)
            print('UNPROCESSED LINE ABOVE')
            continue

        # Only tier lines and block delimiters reach this point.
        eaf.add_annotation(line.tier, line.onset, line.offset, line.content)

    # If no output file option is supplied.
    if outf == '':
        outf = path.basename(inpf).replace('.cha', '.eaf')
    eaf.to_file(outf)
entry = (int(float(m.group(1))), int(m.group(2))) if entry not in subr_comms: subr_comms.append(entry) continue m = silend_rgx2.search(com.line) if m: entry = (int(float(m.group(1))), int(m.group(2))) if entry not in subr_comms: subr_comms.append(entry) continue for prefix, group in grouper.groups(): print prefix if group.clan_sparsecode: sparsecode = pc.ClanFile(group.clan_sparsecode) elif group.clan_final: sparsecode = pc.ClanFile(group.clan_final) elif group.clan_chi_checked: sparsecode = pc.ClanFile(group.clan_chi_checked) elif group.newclan_merged: sparsecode = pc.ClanFile(group.newclan_merged) elif group.newclan_merged_final: sparsecode = pc.ClanFile(group.newclan_merged_final) else: raise Exception() sparsecode.flatten() # sparsecode.write_to_cha("test.cha") lenacha = pc.ClanFile(group.lena_cha)
import pandas as pd
import os

wav_times = pd.read_csv("wav_times.csv")
cha_dir = "data/both_all_ob1fixed"

# Every entry of cha_dir except hidden files and names containing "lena.cha".
cha_files = []
for name in os.listdir(cha_dir):
    if "lena.cha" in name or name.startswith("."):
        continue
    cha_files.append(os.path.join(cha_dir, name))

# NOTE(review): `pc` is not imported in this span — confirm it is brought
# into scope elsewhere.
cha_times = []
for cha_path in cha_files:
    base = os.path.basename(cha_path)
    print(base)
    cf = pc.ClanFile(cha_path)
    # The last tier line supplies the end stamp; files without one add no row.
    last_tier = next(
        (entry for entry in reversed(cf.line_map) if entry.is_tier_line),
        None,
    )
    if last_tier is not None:
        end_stamp = int(last_tier.timestamp().split("_")[1])
        cha_times.append((base[:5], end_stamp))

cha_times = pd.DataFrame(cha_times, columns=["file", "sparse_code_time"])
cha_times.to_csv("cha_times.csv", index=False)

# Join on the file prefix and store the absolute difference per file.
joined = cha_times.merge(wav_times, on="file")
joined["diff"] = (joined["sparse_code_time"] - joined["wav_time"]).abs()
joined.to_csv("time_diffs.csv", index=False)
def pull_regions(path):
    """Tally skip/extra/makeup/silence regions per subregion of a CLAN file.

    User comments are sorted in place by ``offset`` and scanned once.
    A "subregion ... starts" comment opens a subregion and zeroes the
    tallies; a "subregion ... ends" comment stores them.  Inside a subregion,
    begin/end comments of the inner region kinds are paired up and their
    durations (``end.offset - begin.offset``) accumulated.  Structural
    problems are collected instead of raised.

    Args:
        path: Path of the ``.cha`` file to parse.

    Returns:
        An ``(issues, results)`` tuple.  ``issues`` is a list of
        ``[line index, line text, message]``.  ``results`` maps the first
        ``subr_regx`` match of a subregion comment to
        ``[skip_count, skip_time, extra_count, extra_time, makeup_count,
        makeup_time, silence_time]`` (an empty list while the subregion has
        not been closed).
    """
    # (keywords, label, opening word, verb used in the "outside" message).
    # Checked in this order; the closing word is always "end".
    kinds = (
        (("extra",), "extra", "begin", "begins"),
        (("make up", "make-up", "makeup"), "makeup", "begin", "begins"),
        (("skip",), "skip", "begin", "begins"),
        (("silence",), "silence", "start", "starts"),
    )
    # Opening comment of the currently open inner region, per kind.  These
    # are deliberately NOT reset when a new subregion starts.
    open_starts = dict.fromkeys(label for _, label, _, _ in kinds)
    # None outside a subregion; otherwise label -> [count, time].
    tallies = None
    sub_start = None
    results = {}
    issues = []

    cf = pc.ClanFile(path)
    comments = cf.get_user_comments()
    comments.sort(key=lambda x: x.offset)

    for cline in comments:
        line = cline.line

        if "subregion" in line:
            m = subr_regx.findall(line)
            if len(m) > 1:
                issues.append([cline.index, line, "subregion comment repeated"])
            if "starts" in line:
                if sub_start is not None:
                    issues.append([sub_start.index, sub_start.line,
                                   "subregion without end"])
                    continue
                if not m:
                    # Used to crash with IndexError on m[0].
                    issues.append([cline.index, line,
                                   "subregion comment without recognizable index"])
                    continue
                if len(m[0]) < 2:
                    issues.append([cline.index, line,
                                   "subregion without incorrect numbering"])
                sub_start = cline
                tallies = dict((label, [0, 0]) for _, label, _, _ in kinds)
                results[m[0]] = []
                continue
            if "ends" in line:
                if not m:
                    # Used to crash with IndexError on m[0].
                    issues.append([cline.index, line,
                                   "subregion comment without recognizable index"])
                    continue
                if m[0] not in results:
                    issues.append([cline.index, line, "subregion without begin"])
                    continue
                if tallies is None:
                    # An end for a known index while no subregion is open
                    # stores the reset (None) values.
                    results[m[0]] = [None] * 7
                else:
                    results[m[0]] = [
                        tallies["skip"][0], tallies["skip"][1],
                        tallies["extra"][0], tallies["extra"][1],
                        tallies["makeup"][0], tallies["makeup"][1],
                        tallies["silence"][1],
                    ]
                sub_start = None
                tallies = None
                # No `continue`: the line is still checked against the inner
                # region keywords below.

        for keywords, label, start_word, start_verb in kinds:
            if not any(word in line for word in keywords):
                continue

            if tallies is None:
                if start_word in line:
                    message = "{} region {} outside subregion".format(
                        label, start_verb)
                elif "end" in line:
                    message = "{} region ends outside subregion".format(label)
                else:
                    message = "{} region outside subregion".format(label)
                issues.append([cline.index, line, message])
                break

            if start_word in line:
                previous = open_starts[label]
                if previous is not None:
                    issues.append([previous.index, previous.line,
                                   "{} region without end".format(label)])
                open_starts[label] = cline
                break

            if "end" in line:
                opener = open_starts[label]
                if opener is None:
                    issues.append([cline.index, cline.line,
                                   "{} region without begin".format(label)])
                    break
                tallies[label][0] += 1
                tallies[label][1] += cline.offset - opener.offset
                open_starts[label] = None
                break
            # Keyword present but no boundary word: try the next kind.

    return issues, results
input_dir = "../collect/all_cha"
# input_dir = "problems"
output_dir = "audio_bl_out2"

header = ["tier", "word", "utterance_type", "object_present",
          "speaker", "timestamp", "pho", "basic_level"]


def output_bl(f, annots):
    """Write annotations to ``<output_dir>/<f[:5]>_audio_sparse_code_processed.csv``.

    Args:
        f: Name of the source file; only its first five characters are used.
        annots: Iterable of annotation objects; one CSV row is written per
            item, with an empty last column.
    """
    target = "{}_audio_sparse_code_processed.csv".format(
        os.path.join(output_dir, f[:5]))
    with open(target, "wb") as out:
        writer = csv.writer(out)
        writer.writerow(header)
        writer.writerows(
            [a.tier, a.word, a.utt_type, a.present, a.speaker,
             a.timestamp(), a.pho_annot.split("_")[0], ""]
            for a in annots
        )


for root, dirs, files in os.walk(input_dir):
    for fname in files:
        if not fname.endswith(".cha"):
            continue
        print(fname)
        try:
            cf = pc.ClanFile(os.path.join(root, fname))
            cf.annotate()
            cf.assign_pho()
            chi_annots = [a for a in cf.annotations() if a.speaker == "CHI"]
            # Files without CHI annotations produce no CSV.
            if chi_annots:
                output_bl(fname, chi_annots)
        except Exception as e:
            # Report the failure and move on to the next file.
            print(repr(e))
if __name__ == "__main__":
    # Usage: <script> START_DIR OUTPUT_DIR
    start_dir = sys.argv[1]
    output_dir = sys.argv[2]

    for root, dirs, files in os.walk(start_dir):
        cha_files = [file for file in files if file.endswith(".cha")]
        # Only directories holding exactly one .cha file are processed.
        if len(cha_files) == 1:
            cha_file = cha_files[0]
            filepath = os.path.join(root, cha_file)
            # Output paths derived from the .cha name; not used within the
            # visible part of this block.
            csv_path = os.path.join(output_dir,
                                    cha_file.replace(".cha", ".csv"))
            new_cha_path = os.path.join(
                output_dir, cha_file.replace(".cha", "_idslabel.cha"))
            clan_file = pyclan.ClanFile(filepath)
            # NOTE(review): this is the same object as clan_file.block_index,
            # and random.shuffle works in place — confirm that reordering the
            # ClanFile's own attribute is intended.
            random_blockrange = clan_file.block_index
            random.shuffle(random_blockrange)
            selected_blocks = []
            scrub_tiers = clan_file.get_tiers("SCR")
            # [onset, offset] pairs of the "SCR" tier lines.
            scrub_intervals = []
            if len(scrub_tiers) > 0:
                # NOTE(review): assumes get_tiers returns an object that
                # supports both len() and .line_map — verify against pyclan.
                for interval in scrub_tiers.line_map:
                    scrub_intervals.append(
                        [interval.time_onset, interval.time_offset])
            # Visit the conversation blocks in the shuffled order.
            for block_num in random_blockrange:
                block = clan_file.get_conv_block(block_num)
import pyclan as pc

# Sample script: insert an empty "%pho:" line after every CHI line of a
# sample file and write the result to a new .cha file.
clan_file = pc.ClanFile("../sample_data/44_17_coderSD_final.cha")

results = clan_file.get_with_speaker("CHI")

for x in results:
    # The new line is created for, and inserted at, the index right after x.
    # NOTE(review): x.index was read before any insertion; if insert_line
    # does not renumber the lines already in `results`, later insertions land
    # at stale positions — confirm against pyclan.
    line = pc.ClanLine(index=x.index + 1, line="%pho:\t\n")
    clan_file.insert_line(line, x.index + 1)

# NOTE(review): in the collapsed source `print` is immediately followed by the
# write_to_cha call; it may originally have been a single Python 2 statement
# printing that call's return value — confirm.
print
clan_file.write_to_cha("44_17_with_pho.cha")

# clan_file.replace_comments(["multi-word", "MWU", "mwu"], "this is a test")
# clan_file.write_to_cha("44_17_new.cha")
# print clan_file.total_time