Example #1
import os
import pickle

# Assumed to be provided by the surrounding project: utils (read_ann),
# subjects (list of subject IDs), and parse().


def check_is_empty(annfile):
    '''
    Takes an annotation file and returns True if 'NA' is in the annotation.
    '''
    annotation = utils.read_ann(annfile)
    return "NA" in annotation
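

# Sanity check: for each subject and list number, print the annotated words
# that also appear in the corresponding .lst word list.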
def parse_ann():
    for subject in subjects:
        print(subject)
        for x in range(24):
            annfile = '/Users/francob/real_ann/{}/{}.ann'.format(subject, x)
            lstfile = '/Users/francob/real_ann/{}/{}.lst'.format(subject, x)
            if os.path.exists(annfile) and os.path.exists(lstfile):
                words = [w.rstrip('\n').strip().upper() for w in open(lstfile)]
                real_hits = [
                    w['word'].upper() for w in utils.read_ann(annfile)
                    if w['word'].upper() in words
                ]
                print(real_hits)
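

# Compare Google Speech (gs) and DeepSpeech (ds) output against the human
# .ann annotations for every subject and list, then print per-subject
# accuracy summaries.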
def parse_all():
    gs_total = []
    ds_total = []
    for subject in subjects:
        print(subject)
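        # tp/fp/fn/tl: true positives, false positives, false negatives, and
        # total annotated words for Google Speech; the *2 lists hold the same
        # counters for DeepSpeech.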
        tp = []
        fp = []
        fn = []
        tl = []
        tp2 = []
        fp2 = []
        fn2 = []
        tl2 = []
        for x in range(24):
            annfile = '/Users/francob/real_ann/{}/{}.ann'.format(subject, x)
            lstfile = '/Users/francob/real_ann/{}/{}.lst'.format(subject, x)
            gspkl = '/Users/francob/{}/{}.pkl'.format(subject, x)
            gslst = '/Users/francob/{}/{}.lst'.format(subject, x)
            gs_seen = []
            ds_seen = []
            real_hits = []
            pklfile = '/Users/francob/ram_analyses/ram_analyses/{}/{}.pkl'.format(
                subject, x)

            if os.path.exists(lstfile):
                words = [x.rstrip('\n').strip().upper() for x in open(lstfile)]
                if (os.path.exists(gspkl) and os.path.exists(gslst)):
                    gs_seen = [
                        x.upper() for x in parse(gspkl) if x.upper() in words
                    ]
                    # print('Google Cloud')
                    # print(gs_seen)

                if os.path.exists(pklfile):
                    with open(pklfile, 'rb') as pkl:
                        ds_seen = [
                            x.upper() for x in pickle.load(pkl)
                            if x.upper() in words
                        ]

                if os.path.exists(annfile):
                    real_hits = [
                        w['word'].upper() for w in utils.read_ann(annfile)
                        if w['word'].upper() in words
                    ]

                    if gs_seen:
                        # print('Real words')
                        # print(real_hits)
                        tp.append(
                            len(set(gs_seen).intersection(set(real_hits))))
                        fp.append(len(set(gs_seen) - set(real_hits)))
                        fn.append(len(set(real_hits) - set(gs_seen)))
                        tl.append(len(set(real_hits)))
                        if set(real_hits) != set(gs_seen):
                            print(subject, x)
                            print(real_hits)
                            print(gs_seen)

                    if ds_seen:

                        tp2.append(
                            len(set(ds_seen).intersection(set(real_hits))))
                        fp2.append(len(set(ds_seen) - set(real_hits)))
                        fn2.append(len(set(real_hits) - set(ds_seen)))
                        tl2.append(len(set(real_hits)))

                    # if ds_seen and gs_seen:
                    #     print('Real words')
                    #     print(real_hits)
                    #     print('Deepspeech')
                    #     print(ds_seen)
                    #     print('Google Cloud')
                    #     print(gs_seen)

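        # (tl - fp - fn) / tl is 1.0 for a perfect transcript and drops by
        # 1/tl for each false positive or false negative; max(..., 1) guards
        # against dividing by zero when nothing was annotated.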
        gs_total.append([(sum(tl) - sum(fp) - sum(fn)) / max(sum(tl), 1),
                         sum(tp),
                         sum(fp),
                         sum(fn),
                         sum(tl)])
        ds_total.append([(sum(tl2) - sum(fp2) - sum(fn2)) / max(sum(tl2), 1),
                         sum(tp2),
                         sum(fp2),
                         sum(fn2),
                         sum(tl2)])
    print([x[0] for x in gs_total])
    print([x[0] for x in ds_total])

    data = [x[0] for x in gs_total if x[0] > 0]
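

# A toy check of the scoring arithmetic above (hypothetical word lists):
real_hits = ['CAT', 'DOG', 'BIRD']        # words the human annotator marked
gs_seen = ['CAT', 'FISH']                 # words the recognizer reported
tp = len(set(gs_seen) & set(real_hits))   # 1 ('CAT')
fp = len(set(gs_seen) - set(real_hits))   # 1 ('FISH')
fn = len(set(real_hits) - set(gs_seen))   # 2 ('DOG', 'BIRD')
tl = len(set(real_hits))                  # 3
assert (tl - fp - fn) / max(tl, 1) == 0.0  # 1.0 minus 1/3 per error, 3 errors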
Example #4
import glob
import os
from collections import defaultdict

import numpy as np


def main():
    '''
    This is meant to be called from parsync.sh.
    '''
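    # Helpers assumed from the surrounding project (not defined in this
    # snippet): find_recently_modified.find_cont_subdir, check_all_exist,
    # utils.read_ann / utils.add_ann, write_par_file, email, check_is_empty,
    # plus the subjects/spanish lists and modify()/parse() used by alt_main().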

    rec_mod_dirs = find_recently_modified.find_cont_subdir(
        '/home1/maint/parse_files/', 'session')

    for dir in rec_mod_dirs:

        # Check whether this directory has already been combined.
        # ('combined_sessions.txt' is a placeholder; the real log path is
        # not given in this snippet.)
        with open('combined_sessions.txt') as comb_sess:
            comb_sess_list = [line.rstrip('\n') for line in comb_sess]

        if dir not in comb_sess_list:
            chunk_dir = os.path.join(dir, 'chunks/')
            fta_dir = os.path.join(dir, 'files_to_annotate/')

            # If it hasn't been combined, try to combine it.
            if check_all_exist(fta_dir):

                # To keep track of all the files, we look at the .times files,
                # starting with the one that lists the chunks.
                time_files = glob.glob(os.path.join(dir, '*.times'))
                for tf in time_files:
                    base = os.path.splitext(os.path.basename(tf))[0]
                    with open(tf) as time_list_file:
                        time_list = [
                            x.rstrip('\n').split(' ')[0]
                            for x in time_list_file
                        ]

                    # Now we can find the ann files.
                    for i, chunk_start in enumerate(time_list):
                        current_chunk_ann_file = os.path.join(
                            chunk_dir, '{}_{}.ann'.format(chunk_start, i))
                        current_chunk_ann = utils.read_ann(
                            current_chunk_ann_file)

                        current_chunk_time_file = os.path.join(
                            chunk_dir, '{}_{}.times'.format(chunk_start, i))

                        # The chunk .times file exists only if the chunk
                        # contained unknown words.
                        if os.path.exists(current_chunk_time_file):
                            with open(current_chunk_time_file) as ctf:
                                chunk_times = [
                                    line.rstrip('\n').split(' ') for line in ctf
                                ]
                            current_time_list = [t[0] for t in chunk_times]
                            current_end_time_list = [t[1] for t in chunk_times]

                            # Keep only annotations that start before the first
                            # unknown window; taking ft from the first start
                            # time, and dict-style read_ann rows, are
                            # assumptions in this sketch.
                            ft = float(current_time_list[0])
                            current_chunk_ann = [
                                a for a in current_chunk_ann
                                if float(a['time']) < ft
                            ]

                            # Now we remove the unks and surrounding words
                            # from the original ann file.
                            unk_replacements = []

                            for i2, fta_start in enumerate(current_time_list):
                                # Find the user-generated ann for this unknown
                                # word; fta_dir is assumed as its location.
                                fta_ann_file = os.path.join(
                                    fta_dir,
                                    '{}_{}_{}.ann'.format(i, i2, fta_start))
                                if not check_is_empty(fta_ann_file):
                                    fta_ann = utils.read_ann(fta_ann_file)
                                    unk_replacements.extend(
                                        utils.add_ann(fta_ann, fta_start))

                            current_chunk_ann.extend(unk_replacements)
                        write_par_file(current_chunk_ann,
                                       os.path.join(dir, base + '.par'))
            else:
                email(
                    'Some files did not exist in {}, so it was not automatically combined.'.format(dir),
                    '*****@*****.**')
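

# The .times parsing above assumes space-separated "start end ..." lines;
# for a made-up line '1.23 4.56 WORD', split(' ')[0] gives the start time
# '1.23' and split(' ')[1] gives the end time '4.56'.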
def alt_main():
    gs_total = []
    fns_dict = defaultdict(int)
    fps_dict = defaultdict(int)
    subject_dict = {}
    for subject in subjects:
        if subject not in spanish:
            print(subject)
            tp = []
            fp = []
            fn = []
            tl = []
            for num in range(24):
                annfile = '/data/eeg/{}/behavioral/FR1/session_0/{}.ann'.format(subject, num)
                lstfile = '/data/eeg/{}/behavioral/FR1/session_0/{}.lst'.format(subject, num)
                pklfile = '/scratch/francob/big_ram_analyses_alternatives/{}/{}.pkl'.format(subject, num)
                gs_seen = []
                ds_seen = []
                real_hits = []
                if os.path.exists(lstfile):
                    raw_words = [x.rstrip('\n').strip().lower() for x in open(lstfile)]
                    modify_dict = {modify(x):x for x in raw_words}
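                    # modify() (defined elsewhere) normalizes a word; the dict
                    # maps each normalized form back to its raw .lst entry,
                    # presumably so parse() can match recognizer output to it.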

                    if os.path.exists(pklfile):
                        gs_seen = [x.lower()
                                   for x in parse(pklfile, raw_words, modify_dict)
                                   if x.lower() in raw_words]


                    if os.path.exists(annfile):
                        real_hits = [x['word'].lower()
                                     for x in utils.read_ann(annfile)
                                     if x['word'].lower() in raw_words]


                    if gs_seen:
                        # print('Real words')
                        # print(real_hits)
                        # print('Cloud')
                        # print(gs_seen)

                        temp_tp = len(set(gs_seen).intersection(set(real_hits)))
                        temp_fp = len(set(gs_seen) - set(real_hits))
                        temp_fn = len(set(real_hits) - set(gs_seen))
                        temp_tl = len(set(real_hits))


                        tp.append(temp_tp)
                        fp.append(temp_fp)
                        fn.append(temp_fn)
                        tl.append(temp_tl)

                        for w in set(real_hits) - set(gs_seen):
                            fns_dict[w] += 1

                        for w in set(gs_seen) - set(real_hits):
                            fps_dict[w] += 1

                        if temp_fn + temp_fp > 0:
                            print(subject, num)
                            print('real_hits')
                            print(list(real_hits))
                            print('gs_seen')
                            print(list(gs_seen))
                            # print('raw_phonemes')
                            # print(list(raw_phonemes))

            total_diff = ((sum(tl) - sum(fp) - sum(fn)) / float(max(sum(tl), 1)))
            if sum(tl):
                gs_total.append([total_diff, sum(tp), sum(fp), sum(fn), sum(tl)])
                subject_dict[subject] = total_diff

    diffs = [x[0] for x in gs_total]
    tps = [x[1] for x in gs_total]
    fps = [x[2] for x in gs_total]
    fns = [x[3] for x in gs_total]
    tls = [x[4] for x in gs_total]
    print(fns_dict)
    print(fps_dict)

    print(sorted(subject_dict.items(), key=lambda kv: kv[1], reverse=True))
    print(diffs)
    # print(tls)
    print((sum(tls) - sum(fps) - sum(fns)) / float(sum(tls)))
    print(sum(fps) / float(sum(tls)))
    print(sum(fps))
    print(sum(fns))
    print(sum(fns) / float(sum(tls)))
    print(np.mean([subject_dict[x] for x in subject_dict]))
    # print(subject_dict)
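    # Group per-subject scores by the last character of the subject ID,
    # which presumably encodes the recording site.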
    location_dict = {}
    for subject in subject_dict:
        location_dict.setdefault(subject[-1], []).append(subject_dict[subject])
    print(location_dict)
    for location in location_dict:
        print(location)
        print(np.mean(location_dict[location]))