Beispiel #1
0
def walk_tree(dir):
    """Collect region timing totals for every ``.cha`` file under ``dir``.

    Each ``.cha`` file is parsed with ``pyclan.ClanFile``; its user comments
    are reduced with ``extract_region_comments`` and grouped with
    ``group_comments`` into subregions, silences and skips.  Every group is
    treated as a sequence of indexable pairs whose duration is
    ``pair[1] - pair[0]``.

    Files whose grouping fails are counted and described in ``errors.txt``
    (created in the current working directory, truncated on every call).

    Returns:
        A list of ``[file, total_time, subregion_time, silence_time,
        skip_time]`` rows, one per successfully processed file.
    """
    results = []
    fail_count = 0
    with open('errors.txt', 'wb') as err_out:
        for root, dirs, files in os.walk(dir):
            for file in files:
                if not file.endswith(".cha"):
                    continue
                print("parsing file:  {}".format(file))
                clan_file = pyclan.ClanFile(os.path.join(root, file))
                comments = clan_file.get_user_comments()
                regions = extract_region_comments(comments)
                try:
                    subregions, silences, skips = group_comments(regions)
                    subrg_time = sum(x[1] - x[0] for x in subregions)
                    silen_time = sum(x[1] - x[0] for x in silences)
                    skips_time = sum(x[1] - x[0] for x in skips)
                    results.append([file, clan_file.total_time, subrg_time,
                                    silen_time, skips_time])
                except Exception as e:
                    # A (message, regions) payload is used when the exception
                    # carries exactly two args.  Anything else (e.g. a
                    # TypeError raised while summing) used to blow up on the
                    # tuple unpacking and abort the whole walk, so fall back
                    # to the exception itself and the regions we passed in.
                    if len(e.args) == 2:
                        msg, failed_regions = e.args
                    else:
                        msg, failed_regions = repr(e), regions
                    print(msg)
                    fail_count += 1
                    err_out.write("{}\n\n".format(file))
                    err_out.writelines(
                        ["{} ---  {}\n".format(comment.line.replace("\n", ""),
                                               comment.time_onset)
                         for comment in failed_regions])
                    err_out.write("\n\n\n")

    print("\n\n\nfailed file count:  {}".format(fail_count))
    return results
def process_file(path):
    """Extract subregion boundaries from a CLAN file into ``regions``.

    User comments containing "subregion" are sorted by ``offset`` and matched
    against the module-level ``subr_regx``.  The first element of each match
    is used as the subregion index; the two smallest offsets recorded for an
    index become that subregion's onset and offset.

    One ``[file prefix, index, onset, offset]`` row per subregion index is
    appended to the module-level ``regions`` list.  Nothing is returned.

    Raises:
        Exception: if the number of subregion comments is odd, if a comment
            does not contain exactly one regex match, or if an index has
            fewer than two comments.
    """
    cf = pc.ClanFile(path)
    pfx = cf.filename[:5]

    # All comments that mention a subregion, ordered by offset.
    subrs = sorted(
        (c for c in cf.get_user_comments() if "subregion" in c.line),
        key=lambda c: c.offset)

    # Subregion lines should come in pairs.
    if len(subrs) % 2 != 0:
        raise Exception(
            "odd number of subregion comments ({}) in {}".format(
                len(subrs), path))

    offsets_by_index = {}
    for comment in subrs:
        matches = subr_regx.findall(comment.line)
        # Exactly one match per line; zero matches used to surface as a bare
        # IndexError from matches[0].
        if len(matches) != 1:
            raise Exception(
                "expected exactly one subregion match, found {} in {!r} "
                "({})".format(len(matches), comment.line, path))
        index = matches[0][0]
        # Comments are sorted, so the first offset stored for an index is the
        # subregion onset and the second one its offset.
        offsets_by_index.setdefault(index, []).append(comment.offset)

    rows = []
    for index, offsets in offsets_by_index.items():
        if len(offsets) < 2:
            raise Exception(
                "subregion {} has a single comment in {}".format(index, path))
        # filename prefix, subregion index, onset, offset
        rows.append([pfx, int(index), offsets[0], offsets[1]])
    regions.extend(rows)
Beispiel #3
0
def process_file(f):
    """Bump timestamps ending in 9 by one when the result is a known time.

    ``f`` supplies ``f.file`` (path of the CLAN file) and ``f.times`` (the
    collection the corrected ``onset_offset`` string is looked up in).  Every
    timestamped line whose onset or offset ends in "9" gets that value
    incremented by one; the line text is rewritten only when the corrected
    timestamp is in ``f.times``.  If at least one line changed, the file is
    written to ``out_dir`` and the running global ``total_fixed`` is printed.
    """
    global total_fixed

    cf = pc.ClanFile(f.file)
    announced = False
    for entry in cf.line_map:
        if not entry._has_timestamp:
            continue

        stamp = entry.timestamp().split("_")
        if stamp[0][-1] != "9" and stamp[1][-1] != "9":
            continue

        before = "_".join(stamp)
        if stamp[0][-1] == "9":
            stamp[0] = str(int(stamp[0]) + 1)
        if stamp[1][-1] == "9":
            stamp[1] = str(int(stamp[1]) + 1)
        after = "_".join(stamp)

        if after not in f.times:
            continue

        # Print the file name once, ahead of its first fix.
        if not announced:
            print("**{}**".format(os.path.basename(f.file)))
        announced = True
        entry.line = entry.line.replace(before, after)
        total_fixed += 1

    if announced:
        cf.write_to_cha(os.path.join(out_dir, cf.filename))
        print("\t\t\t\ttotal fixed so far: {}".format(total_fixed))
def sum_time(filepath):
    """Return the summed duration of all tier lines in the CLAN file.

    A line's duration is ``time_offset - time_onset``; lines that are not
    tier lines are ignored.
    """
    tier_lines = (entry for entry in pc.ClanFile(filepath).line_map
                  if entry.is_tier_line)
    return sum(entry.time_offset - entry.time_onset for entry in tier_lines)
def end_time(filepath):
    """Return ``time_offset`` of the last tier line in the CLAN file.

    The line map is scanned from the end.  When the file has no tier line
    the function falls through to the trailing ``print`` and returns None.
    """
    for candidate in reversed(pc.ClanFile(filepath).line_map):
        if candidate.is_tier_line:
            return candidate.time_offset
    # Bare print statement kept from the original: a blank line under
    # Python 2, a no-op expression under Python 3.
    print
Beispiel #6
0
def walk_tree(start_dir):
    """Record every ``.cha`` file under ``start_dir`` and export the list.

    Files with at least one annotation spoken by "CHI" are copied into the
    module-level ``output_dir`` when ``copy_to_folder`` is truthy.  Every
    ``.cha`` file name, with or without a CHI annotation, is appended to the
    module-level ``list_of_files``, which is finally passed to ``output_csv``.
    """
    for root, dirs, files in os.walk(start_dir):
        for name in files:
            if not name.endswith(".cha"):
                continue
            source = os.path.join(root, name)
            annots = pc.ClanFile(source).annotations()
            has_chi = any(annot.speaker == "CHI" for annot in annots)
            if has_chi and copy_to_folder:
                shutil.copy(source, os.path.join(output_dir, name))
            list_of_files.append(name)

    output_csv(list_of_files)
def pull_regions(path):
    """List region boundary events found in a CLAN file's user comments.

    Comments are sorted in place by ``offset``.  Each comment is classified
    by the first keyword group it contains (subregion, extra, silence, skip,
    makeup) and contributes a ``(label, offset)`` tuple for every start/end
    keyword of that group present in its text.

    Returns:
        ``(sequence, cf)``: the list of ``(label, offset)`` tuples and the
        parsed ``ClanFile``.
    """
    # (trigger keywords, start keyword, end keyword, start label, end label).
    # Only the first row with a matching trigger is applied to a comment.
    rules = (
        (('subregion',), 'starts', 'ends',
         'subregion starts', 'subregion ends'),
        (('extra',), 'begin', 'end',
         'extra starts', 'extra ends'),
        (('silence',), 'start', 'end',
         'silence starts', 'silence ends'),
        (('skip',), 'begin', 'end',
         'skip starts', 'skip ends'),
        (('makeup', 'make-up', 'make up'), 'begin', 'end',
         'makeup starts', 'makeup ends'),
    )

    cf = pc.ClanFile(path)
    comments = cf.get_user_comments()
    comments.sort(key=lambda c: c.offset)

    sequence = []
    for comment in comments:
        text = comment.line
        for triggers, start_kw, end_kw, start_label, end_label in rules:
            if not any(trigger in text for trigger in triggers):
                continue
            # A single comment may contain both keywords and then yields
            # both events, start first.
            if start_kw in text:
                sequence.append((start_label, comment.offset))
            if end_kw in text:
                sequence.append((end_label, comment.offset))
            break
    return sequence, cf
def _write_pi_errors(path, begin, end):
    """Dump all begin/end lines, ordered by onset, to errors/<prefix>_pi_errors."""
    # Create the folder on demand so a missing directory cannot replace the
    # real diagnostic with an IOError.
    if not os.path.isdir("errors"):
        os.makedirs("errors")
    report_path = os.path.join(
        "errors", "{}_pi_errors".format(os.path.basename(path)[:5]))
    with open(report_path, "wb") as out:
        for entry in sorted(begin + end, key=lambda e: e.onset):
            out.write("{} - {}\n".format(entry.line.replace("\n", ""),
                                         entry.timestamp()))


def extract_pi_regions(path):
    """Return the personal-information regions marked in a CLAN file.

    Lines containing "begin personal" and "end personal" are paired up in
    file order.

    Returns:
        A list of ``(path, begin_onset, end_onset)`` tuples.

    Raises:
        Exception: if the begin and end counts differ, or if a begin onset is
            not strictly smaller than its paired end onset.  In both cases
            the offending lines are first written to
            ``errors/<first five chars of the file name>_pi_errors``.
    """
    cf = pc.ClanFile(path)
    begin = [x for x in cf.line_map if "begin personal" in x.line]
    end = [x for x in cf.line_map if "end personal" in x.line]

    if len(begin) != len(end):
        _write_pi_errors(path, begin, end)
        raise Exception("Begin and End comment count mismatch: {}".format(
            os.path.basename(path)))

    regions = [(path, b.onset, e.onset) for b, e in zip(begin, end)]

    if any(not start < stop for _, start, stop in regions):
        _write_pi_errors(path, begin, end)
        raise Exception("End comment precedes Begin:           {}".format(
            os.path.basename(path)))

    return regions
Beispiel #9
0
import pandas as pd
import sys
import os
import pyclan as pc

if __name__ == "__main__":
    # Usage: <problems csv> <directory of .cha files> <output directory>
    problems = pd.read_csv(sys.argv[1])
    cha_dir, out_dir = sys.argv[2], sys.argv[3]

    # Map the first five characters of each .cha file name to its full path.
    cha_files = [name for name in os.listdir(cha_dir) if name.endswith(".cha")]
    files = {name[:5]: os.path.join(cha_dir, name) for name in cha_files}

    # One output file per SubjectNumber, listing every user comment that is
    # neither a subregion nor a silence marker.
    for pfx, annots in problems.groupby('SubjectNumber'):
        cf = pc.ClanFile(files[pfx])
        coms = [
            com for com in cf.get_user_comments()
            if not ("subregion" in com.line or "silence" in com.line)
        ]
        with open(os.path.join(out_dir, pfx), 'wb') as out:
            out.writelines(
                "{}  ---  {}\n".format(com.line.replace("\n", ""), com.onset)
                for com in coms)
Beispiel #10
0
def process_single_clan_file(path, output_folder="output/cha_structures"):
    """Check one CLAN file's region structure and compute its listen time.

    A report ``<output_folder>/<file name>.txt`` is written with the sorted
    region boundaries, the detected errors and the subregion descriptions.
    When listen time can be computed, the processed region map is also saved
    to ``<output_folder>/processed/<file stem>.csv``.

    Args:
        path: path of the ``.cha`` file.
        output_folder: folder that receives the report files; created if it
            does not exist.

    Returns:
        ``(file_with_error_, listen_time)``.  ``file_with_error_`` is
        ``(basename, error_list)`` when errors were found, otherwise None.
        ``listen_time`` is the dict returned by ``total_listen_time``
        extended with ``filename``, ``subregions``, ``ranks`` and
        ``positions``, or None when the file could not be opened, has a
        "missing" error, or the listen-time computation failed.
    """
    file_name = os.path.basename(path)
    output_path = Path(output_folder) / (Path(path).name + '.txt')
    # Delete the report left over from a previous run.
    output_path.unlink(missing_ok=True)

    file_with_error_, listen_time = None, None

    print("Checking {}".format(file_name))

    # Parse the clan file
    try:
        clan_file = pyclan.ClanFile(path)
    except Exception:
        print(BColors.FAIL + "Error opening file: {}".format(path) +
              BColors.ENDC)
        print(sys.exc_info())
        return file_with_error_, listen_time

    # Extract sequence of all starts/ends of all regions and subregion
    # positions and ranks
    region_boundaries, subregions = pull_regions(clan_file=clan_file)

    # Sort that sequence by timestamp and - in case of collisions - by
    # region rank
    region_boundaries = sort_list_of_region_boundaries(region_boundaries)

    # Check for errors
    error_list, region_map = sequence_missing_repetition_entry_alert(
        region_boundaries)
    if error_list:
        print(
            BColors.WARNING +
            "Finished {0} with errors! Listen time cannot be calculated due to missing starts or ends!\nCheck the {0}.txt file for errors!"
            .format(file_name) + BColors.ENDC)
        file_with_error_ = (file_name, error_list)

    # Write results to a text file.  The folder is created first: open()
    # does not create missing parent directories.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w') as f:
        # Region boundaries
        f.write('\n'.join([
            region_type_and_side + '   ' + str(timestamp)
            for region_type_and_side, timestamp in region_boundaries
        ]))
        f.write('\n' * 3)

        # List of errors
        f.write('\n'.join(error_list))

        # Subregion information
        f.write('\n')
        f.write('\n'.join(subregions))

    # A missing start or end makes the listen time meaningless, so stop here.
    if any('missing' in error for error in error_list):
        return file_with_error_, listen_time

    try:
        # Characters 3-4 of the file name select the month67 mode for
        # '06' and '07' files.
        month67 = file_name[3:5] in ['06', '07']
        listen_time, processed_region_map = total_listen_time(clan_file,
                                                              region_map,
                                                              month67=month67)
    except Exception as e:
        # Still best-effort (the caller gets listen_time=None), but the
        # failure is now reported instead of being swallowed silently.
        print(BColors.FAIL +
              "Error calculating listen time for {}: {!r}".format(
                  file_name, e) + BColors.ENDC)
        return file_with_error_, listen_time

    # Save processed region map for debugging
    processed_region_map_path = Path(output_folder) / 'processed' / Path(
        path).with_suffix('.csv').name
    processed_region_map_path.parent.mkdir(exist_ok=True, parents=True)
    region_map_to_df(processed_region_map).to_csv(processed_region_map_path,
                                                  index=False)

    listen_time['filename'] = file_name

    # Each subregion is a string like 'Position: 4, Rank: 4'; pull out the
    # two values as strings.
    positions = []
    ranks = []
    for subregion in subregions:
        position_string, rank_string = subregion.split(',')
        positions.append(position_string.split()[1])
        ranks.append(rank_string.split()[1])

    listen_time['subregions'] = subregions
    listen_time['ranks'] = ranks
    listen_time['positions'] = positions
    print("Finished {}".format(file_name) +
          '\nTotal Listen Time: ' + BColors.OKGREEN +
          str(ms2hr(listen_time['total_listen_time'])) + BColors.ENDC)
    print(subregions)

    return file_with_error_, listen_time
Beispiel #11
0
def cha2eaf(inpf, outf):
    """Convert a given cha file to an eaf file.

    Args:
        inpf: path of the input ``.cha`` file.
        outf: path of the output ``.eaf`` file.  An empty string means "use
            the input file's base name with '.cha' replaced by '.eaf'".

    Tier lines and block delimiters become regular annotations; comment,
    'xcom' and XDB lines are added through ``add_ref_annotation``.  Lines
    that match none of the handled kinds are printed and skipped.
    """
    eaf = setup_eaf()
    cha = pyclan.ClanFile(inpf)

    # Currently not returning anything, but might return a participant
    # database in the future?
    _process_header(cha, eaf)

    # Processing the rest of the file after the header.
    end = cha.get_header()[-1].index

    current_tier = ''
    # The plus one below starts the iteration right after the last header
    # line.
    for line in cha.line_map[end+1:]:
        # This is a tier line.
        if line.is_tier_line:
            # If the tier does not exist, add it first.
            if line.tier not in eaf.get_tier_names():
                eaf.add_tier(line.tier)

            current_tier = line.tier

        elif line.is_paus_block_delimiter or line.is_conv_block_delimiter:
            line.tier = BLOCK
            line.content = line.line.strip()

            # pympi does not like onset and offset being the same...
            if line.onset == 0 and line.offset == 0:
                line.onset, line.offset = 0, 1

            else:
                line.onset += 1
                line.offset -= 1

        elif line.is_clan_comment or line.is_user_comment or 'xcom' in line.line:
            line.parent_tier = line.tier if line.tier in eaf.get_tier_names() else current_tier
            # Split on the first tab only: the comment text may itself
            # contain tabs, which used to break the two-way unpacking.
            line.tier, line.content = line.line.strip().split('\t', 1)
            add_ref_annotation(line, eaf)
            continue

        elif line.xdb_line:
            line.tier = XDB

            # Usually XDB lines do get parent_tier set from pyclan, but if
            # there are other lines in between, that does not appear to be
            # the case. So we are setting it here.
            if not line.parent_tier:
                line.parent_tier = current_tier
            add_ref_annotation(line, eaf)
            continue

        elif line.is_end_header:
            # This is just the end header. It is not necessary to add it.
            continue

        else:
            print(line)
            print('UNPROCESSED LINE ABOVE')
            continue

        # Only tier lines and block delimiters reach this point.
        last_annotation = (line.tier, line.onset, line.offset, line.content)
        eaf.add_annotation(*last_annotation)

    # If no output file option is supplied.
    if outf == '':
        outf = path.basename(inpf).replace('.cha', '.eaf')

    eaf.to_file(outf)
0
                entry = (int(float(m.group(1))), int(m.group(2)))
                if entry not in subr_comms:
                    subr_comms.append(entry)
                continue
            m = silend_rgx2.search(com.line)
            if m:
                entry = (int(float(m.group(1))), int(m.group(2)))
                if entry not in subr_comms:
                    subr_comms.append(entry)
                continue


for prefix, group in grouper.groups():
    print(prefix)

    # Candidate sparse-code sources in order of preference; the first one
    # with a truthy value is loaded.
    for source_attr in ("clan_sparsecode", "clan_final", "clan_chi_checked",
                        "newclan_merged", "newclan_merged_final"):
        source_path = getattr(group, source_attr)
        if source_path:
            sparsecode = pc.ClanFile(source_path)
            break
    else:
        # None of the candidates is set for this group.
        raise Exception()

    sparsecode.flatten()
    # sparsecode.write_to_cha("test.cha")
    lenacha = pc.ClanFile(group.lena_cha)
Beispiel #13
0
import pandas as pd
import os

# Compare the last tier-line timestamp of every .cha file with the wav
# durations listed in wav_times.csv.
wav_times = pd.read_csv("wav_times.csv")
cha_dir = "data/both_all_ob1fixed"
cha_files = [
    os.path.join(cha_dir, name) for name in os.listdir(cha_dir)
    if not ("lena.cha" in name or name.startswith("."))
]

cha_times = []

for cha_path in cha_files:
    base_name = os.path.basename(cha_path)
    print(base_name)
    clan = pc.ClanFile(cha_path)
    # The last tier line of the file, if there is one.
    last_tier_line = next(
        (entry for entry in reversed(clan.line_map) if entry.is_tier_line),
        None)
    if last_tier_line is not None:
        # The second half of "onset_offset" is the end time.
        end_ts = int(last_tier_line.timestamp().split("_")[1])
        cha_times.append((base_name[:5], end_ts))

cha_times = pd.DataFrame(cha_times, columns=["file", "sparse_code_time"])

cha_times.to_csv("cha_times.csv", index=False)

joined = cha_times.merge(wav_times, on="file")

joined["diff"] = (joined["sparse_code_time"] - joined["wav_time"]).abs()

joined.to_csv("time_diffs.csv", index=False)
def pull_regions(path):
    """Validate region comments of a CLAN file and tally them per subregion.

    The user comments of the file at ``path`` are sorted by ``offset`` and
    fed through a small state machine:

    * "subregion ... starts" zeroes the per-subregion counters and creates
      ``results[m[0]]`` (``m`` being ``subr_regx.findall(line)``);
    * "subregion ... ends" stores ``[skip_count, skip_time, extra_count,
      extra_time, makeup_count, makeup_time, silence_time]`` under
      ``results[m[0]]`` and sets the counters back to None;
    * "extra", "make up"/"make-up"/"makeup", "skip" (begin/end) and
      "silence" (start/end) comments are paired up; each completed pair adds
      the offset difference to the matching ``*_time`` counter and, except
      for silence, increments the matching ``*_count``.

    Counters equal to None mean "not inside a subregion"; region comments
    seen in that state are reported as issues.

    Returns:
        ``(issues, results)``.  ``issues`` is a list of
        ``[line index, line text, message]`` entries; ``results`` maps
        ``m[0]`` to the seven-element tally list described above (or to an
        empty list for a subregion that started but never ended).
    """
    # Per-subregion counters; None while no subregion is open.
    skip_count = None
    skip_time = None
    extra_count = None
    extra_time = None
    makeup_count = None
    makeup_time = None
    silence_time = None
    # Comment objects of the currently open region of each kind, or None.
    sub_start = None
    silence_start = None
    skip_start = None
    makeup_start = None
    extra_start = None
    results = {}
    issues = []


    cf = pc.ClanFile(path)
    # NOTE(review): pfx is computed but never used in this function.
    pfx = cf.filename[:5]

    comments = cf.get_user_comments()
    comments.sort(key = lambda x: x.offset)
    #print comments

    for cline in comments:
        line = cline.line
        if "subregion" in line:
            m = subr_regx.findall(line)
            # NOTE(review): only len(m) > 1 is checked; if the regex finds no
            # match, the m[0] accesses below raise IndexError.
            if len(m) > 1:
                issues.append([cline.index, line, "subregion comment repeated"])
            if "starts" in line:
                if sub_start is not None:
                    # The earlier, still open start is reported; the current
                    # comment is skipped and sub_start is left unchanged.
                    issues.append([sub_start.index, sub_start.line, "subregion without end"])
                    continue
                if len(m[0])<2:
                    # NOTE(review): message probably meant "with incorrect
                    # numbering" - confirm before changing the string.
                    issues.append([cline.index, line, "subregion without incorrect numbering"])
                sub_start = cline
                # Entering a subregion: start all tallies from zero.
                skip_count = 0
                skip_time = 0
                extra_count = 0
                extra_time = 0
                makeup_count = 0
                makeup_time = 0
                silence_time = 0
                results[m[0]] = []
                continue
            if "ends" in line:
                if m[0] not in results:
                    issues.append([cline.index, line, "subregion without begin"])
                    continue
                results[m[0]] = [skip_count, skip_time, extra_count, extra_time, makeup_count, makeup_time, silence_time]
                # Leaving the subregion: counters go back to "outside".
                # The extra/makeup/skip/silence *_start trackers are not
                # reset here.
                sub_start = None
                skip_count = None
                skip_time = None
                extra_count = None
                extra_time = None
                makeup_count = None
                makeup_time = None
                silence_time = None
                # No `continue` here: a handled "ends" line still falls
                # through to the keyword checks below.
        if "extra" in line:
            if extra_count is None:
                # Outside any subregion: report and skip the comment.
                if "begin" in line:
                    issues.append([cline.index, line, "extra region begins outside subregion"])
                    continue
                if "end" in line:
                    issues.append([cline.index, line, "extra region ends outside subregion"])
                    continue
                issues.append([cline.index, line, "extra region outside subregion"])
                continue
            if "begin" in line:
                if extra_start is not None:
                    # Previous begin never got its end; the new begin
                    # replaces it.
                    issues.append([extra_start.index, extra_start.line, "extra region without end"])
                extra_start = cline
                continue
            if "end" in line:
                if extra_start is None:
                    issues.append([cline.index, cline.line, "extra region without begin"])
                    continue
                extra_count += 1
                extra_time += cline.offset - extra_start.offset
                extra_start = None
                continue
        if "make up" in line or "make-up" in line or "makeup" in line:
            if makeup_count is None:
                if "begin" in line:
                    issues.append([cline.index, line, "makeup region begins outside subregion"])
                    continue
                if "end" in line:
                    issues.append([cline.index, line, "makeup region ends outside subregion"])
                    continue
                issues.append([cline.index, line, "makeup region outside subregion"])
                continue
            if "begin" in line:
                if makeup_start is not None:
                    issues.append([makeup_start.index, makeup_start.line, "makeup region without end"])
                makeup_start = cline
                continue
            if "end" in line:
                if makeup_start is None:
                    issues.append([cline.index, cline.line, "makeup region without begin"])
                    continue
                makeup_count += 1
                makeup_time += cline.offset - makeup_start.offset
                makeup_start = None
                continue
        if "skip" in line:
            if skip_count is None:
                if "begin" in line:
                    issues.append([cline.index, line, "skip region begins outside subregion"])
                    continue
                if "end" in line:
                    issues.append([cline.index, line, "skip region ends outside subregion"])
                    continue
                issues.append([cline.index, line, "skip region outside subregion"])
                continue
            if "begin" in line:
                if skip_start is not None:
                    issues.append([skip_start.index, skip_start.line, "skip region without end"])
                skip_start = cline
                continue
            if "end" in line:
                if skip_start is None:
                    issues.append([cline.index, cline.line, "skip region without begin"])
                    continue
                skip_count += 1
                skip_time += cline.offset - skip_start.offset
                skip_start = None
                continue
        if "silence" in line:
            # Silence uses "start"/"end" keywords and only accumulates time;
            # there is no silence counter.
            if silence_time is None:
                if "start" in line:
                    issues.append([cline.index, line, "silence region starts outside subregion"])
                    continue
                if "end" in line:
                    issues.append([cline.index, line, "silence region ends outside subregion"])
                    continue
                issues.append([cline.index, line, "silence region outside subregion"])
                continue
            if "start" in line:
                if silence_start is not None:
                    issues.append([silence_start.index, silence_start.line, "silence region without end"])
                silence_start = cline
                continue
            if "end" in line:
                if silence_start is None:
                    issues.append([cline.index, cline.line, "silence region without begin"])
                    continue
                silence_time += cline.offset - silence_start.offset
                silence_start = None
                continue

    return issues, results
Beispiel #15
0

# Root of the directory tree that is walked for .cha files.
input_dir = "../collect/all_cha"
# input_dir = "problems"
# Folder the per-file CSV output is written into.
output_dir = "audio_bl_out2"

# Column names of the CSV output, in row order.
header = [
    "tier",
    "word",
    "utterance_type",
    "object_present",
    "speaker",
    "timestamp",
    "pho",
    "basic_level",
]

def output_bl(f, annots):
    """Write ``annots`` to ``<output_dir>/<f[:5]>_audio_sparse_code_processed.csv``.

    The file starts with the module-level ``header`` row.  Each annotation
    becomes one row: tier, word, utterance type, presence, speaker,
    timestamp, the part of ``pho_annot`` before its first underscore, and an
    empty last column.
    """
    csv_path = "{}_audio_sparse_code_processed.csv".format(
        os.path.join(output_dir, f[:5]))
    with open(csv_path, "wb") as out:
        writer = csv.writer(out)
        writer.writerow(header)
        writer.writerows(
            [annot.tier, annot.word, annot.utt_type, annot.present,
             annot.speaker, annot.timestamp(),
             annot.pho_annot.split("_")[0], ""]
            for annot in annots)


# Export the CHI annotations of every .cha file below input_dir.  A failure
# in one file is printed and does not stop the walk.
for root, dirs, files in os.walk(input_dir):
    for name in files:
        if not name.endswith(".cha"):
            continue
        print(name)
        try:
            cf = pc.ClanFile(os.path.join(root, name))
            cf.annotate()
            cf.assign_pho()
            chi_annots = [a for a in cf.annotations() if a.speaker == "CHI"]
            # Files without CHI annotations produce no CSV.
            if chi_annots:
                output_bl(name, chi_annots)
        except Exception as e:
            print(repr(e))
Beispiel #16
0
if __name__ == "__main__":

    # Usage: <start directory> <output directory>
    start_dir = sys.argv[1]
    output_dir = sys.argv[2]

    for root, dirs, files in os.walk(start_dir):
        # Only folders holding exactly one .cha file are processed.
        cha_files = [name for name in files if name.endswith(".cha")]
        if len(cha_files) == 1:
            cha_file = cha_files[0]
            filepath = os.path.join(root, cha_file)
            csv_path = os.path.join(
                output_dir, cha_file.replace(".cha", ".csv"))
            new_cha_path = os.path.join(
                output_dir, cha_file.replace(".cha", "_idslabel.cha"))

            clan_file = pyclan.ClanFile(filepath)

            # NOTE(review): if block_index is a plain attribute, this
            # shuffles the ClanFile's own list in place - confirm intended.
            random_blockrange = clan_file.block_index
            random.shuffle(random_blockrange)

            selected_blocks = []

            # [onset, offset] pairs of every line on the "SCR" tier.
            scrub_tiers = clan_file.get_tiers("SCR")
            scrub_intervals = []
            if len(scrub_tiers) > 0:
                scrub_intervals.extend(
                    [scrub_line.time_onset, scrub_line.time_offset]
                    for scrub_line in scrub_tiers.line_map)

            for block_num in random_blockrange:
                block = clan_file.get_conv_block(block_num)
Beispiel #17
0
import pyclan as pc

clan_file = pc.ClanFile("../sample_data/44_17_coderSD_final.cha")

results = clan_file.get_with_speaker("CHI")

# Insert an empty %pho line at the position right after each CHI line.
for chi_line in results:
    insert_at = chi_line.index + 1
    clan_file.insert_line(pc.ClanLine(index=insert_at, line="%pho:\t\n"),
                          insert_at)

# Bare print statement kept from the original (blank line under Python 2).
print

clan_file.write_to_cha("44_17_with_pho.cha")

# clan_file.replace_comments(["multi-word", "MWU", "mwu"], "this is a test")

# clan_file.write_to_cha("44_17_new.cha")

# print clan_file.total_time