import glob
import os
import pickle
from collections import defaultdict

import numpy as np

# Project-local modules; read_ann/add_ann and the recently-modified-directory
# helper are assumed importable from the surrounding package.
import utils
import find_recently_modified

# The names subjects, spanish, parse, modify, check_all_exist, write_par_file
# and email are defined elsewhere in this project and assumed to be in scope.


def check_is_empty(annfile):
    '''Takes an annotation file and returns True if 'NA' is in the annotation.'''
    annotation = utils.read_ann(annfile)
    # read_ann entries are dicts with a 'word' key (see how they are accessed
    # in the functions below), so check the 'word' field of each entry rather
    # than testing membership against the entries themselves.
    return any(entry['word'] == 'NA' for entry in annotation)
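
# Note (an assumption inferred from usage, not documented anywhere in this
# file): utils.read_ann is treated throughout as returning a list of dicts
# with at least 'word' and 'time' keys, e.g.
#
#     utils.read_ann('0.ann') -> [{'time': 1043, 'word': 'CAT'},
#                                 {'time': 2381, 'word': 'NA'}, ...]
#
# Under that assumption, check_is_empty flags any annotation that still
# contains an 'NA' placeholder entry.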

def parse_ann():
    for subject in subjects:
        print(subject)
        for x in range(24):
            annfile = '/Users/francob/real_ann/{}/{}.ann'.format(subject, x)
            lstfile = '/Users/francob/real_ann/{}/{}.lst'.format(subject, x)
            if os.path.exists(annfile) and os.path.exists(lstfile):
                with open(lstfile) as lst:
                    words = [line.rstrip('\n').strip().upper() for line in lst]
                # read_ann takes a path (as everywhere else in this file), so
                # the unused open() handle around it was dropped.
                real_hits = [
                    entry['word'].upper() for entry in utils.read_ann(annfile)
                    if entry['word'].upper() in words
                ]
                # The original printed an undefined name `response`; the
                # annotated hits are the only result computed here.
                print(real_hits)
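
# On-disk layout assumed by parse_ann and parse_all (inferred from the
# hard-coded paths above and below): each subject has up to 24 lists, where
# <n>.lst holds the presented words, one per line, and <n>.ann holds the
# human annotation of the corresponding recall recording.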

def parse_all():
    gs_total = []
    ds_total = []
    for subject in subjects:
        print(subject)
        tp, fp, fn, tl = [], [], [], []
        tp2, fp2, fn2, tl2 = [], [], [], []
        for x in range(24):
            annfile = '/Users/francob/real_ann/{}/{}.ann'.format(subject, x)
            lstfile = '/Users/francob/real_ann/{}/{}.lst'.format(subject, x)
            gspkl = '/Users/francob/{}/{}.pkl'.format(subject, x)
            gslst = '/Users/francob/{}/{}.lst'.format(subject, x)
            pklfile = '/Users/francob/ram_analyses/ram_analyses/{}/{}.pkl'.format(
                subject, x)
            gs_seen = []
            ds_seen = []
            real_hits = []
            if os.path.exists(lstfile):
                with open(lstfile) as lst:
                    words = [line.rstrip('\n').strip().upper() for line in lst]
                # Google Cloud output.
                if os.path.exists(gspkl) and os.path.exists(gslst):
                    gs_seen = [w.upper() for w in parse(gspkl)
                               if w.upper() in words]
                # Deepspeech output.
                if os.path.exists(pklfile):
                    with open(pklfile, 'rb') as pkl:
                        ds_seen = [w.upper() for w in pickle.load(pkl)
                                   if w.upper() in words]
                # Human annotation ("real" words).
                if os.path.exists(annfile):
                    real_hits = [
                        entry['word'].upper() for entry in utils.read_ann(annfile)
                        if entry['word'].upper() in words
                    ]
                if gs_seen:
                    tp.append(len(set(gs_seen).intersection(set(real_hits))))
                    fp.append(len(set(gs_seen) - set(real_hits)))
                    fn.append(len(set(real_hits) - set(gs_seen)))
                    tl.append(len(set(real_hits)))
                    if set(real_hits) != set(gs_seen):
                        print(subject, x)
                        print(real_hits)
                        print(gs_seen)
                if ds_seen:
                    tp2.append(len(set(ds_seen).intersection(set(real_hits))))
                    fp2.append(len(set(ds_seen) - set(real_hits)))
                    fn2.append(len(set(real_hits) - set(ds_seen)))
                    tl2.append(len(set(real_hits)))
        # Per-subject accuracy: (total - fp - fn) / total, guarded against
        # empty sessions with max(..., 1).
        gs_total.append([(sum(tl) - sum(fp) - sum(fn)) / float(max(sum(tl), 1)),
                         sum(tp), sum(fp), sum(fn), sum(tl)])
        ds_total.append([(sum(tl2) - sum(fp2) - sum(fn2)) / float(max(sum(tl2), 1)),
                         sum(tp2), sum(fp2), sum(fn2), sum(tl2)])
    print([x[0] for x in gs_total])
    print([x[0] for x in ds_total])
    data = [x[0] for x in gs_total if x[0] > 0]
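
# A minimal sketch of the set-based scoring that parse_all (and alt_main
# below) repeats inline. The helper name is hypothetical and nothing above
# calls it; it only makes the tp/fp/fn bookkeeping explicit.
def score_recall(hypothesis, reference):
    '''Return (tp, fp, fn, total) for two word lists, compared as sets.'''
    hyp, ref = set(hypothesis), set(reference)
    tp = len(hyp & ref)   # words the recognizer got right
    fp = len(hyp - ref)   # words reported but never recalled
    fn = len(ref - hyp)   # recalled words the recognizer missed
    return tp, fp, fn, len(ref)

# Example: score_recall(['CAT', 'DOG'], ['CAT', 'FISH']) == (1, 1, 1, 2), and
# the per-subject figure printed above is (total - fp - fn) / max(total, 1).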

def main():
    '''This is meant to be called from parsync.sh.'''
    rec_mod_dirs = find_recently_modified.find_cont_subdir(
        '/home1/maint/parse_files/', 'session')
    for dir in rec_mod_dirs:
        # Check whether this session has already been combined. The original
        # called open() with no argument; the log path below is an assumption.
        comb_sess_log = '/home1/maint/parse_files/combined_sessions.txt'
        with open(comb_sess_log) as comb_sess:
            comb_sess_list = [line.rstrip('\n') for line in comb_sess]
        if dir not in comb_sess_list:
            chunk_dir = os.path.join(dir, 'chunks/')
            fta_dir = os.path.join(dir, 'files_to_annotate/')
            # If it hasn't been combined, try to combine.
            if check_all_exist(fta_dir):
                # To keep track of all the files we look at the .times files:
                # first, the session-level one that lists the chunk files.
                time_files = glob.glob(os.path.join(dir, '*.times'))
                for tf in time_files:
                    base = os.path.splitext(os.path.basename(tf))[0]
                    with open(tf) as time_list_file:
                        time_list = [line.rstrip('\n').split(' ')[0]
                                     for line in time_list_file]
                    # Now we can find the ann for each chunk.
                    for i, chunk_start in enumerate(time_list):
                        current_chunk_ann_file = os.path.join(
                            chunk_dir, '{}_{}.ann'.format(chunk_start, i))
                        current_chunk_ann = utils.read_ann(current_chunk_ann_file)
                        current_chunk_time_file = os.path.join(
                            chunk_dir, '{}_{}.times'.format(chunk_start, i))
                        # If the chunk had unknown words this file exists,
                        # otherwise it doesn't. The original iterated over the
                        # file *name*; the file has to be opened first.
                        if os.path.exists(current_chunk_time_file):
                            with open(current_chunk_time_file) as ctf:
                                rows = [line.rstrip('\n').split(' ')
                                        for line in ctf]
                            current_time_list = [row[0] for row in rows]
                            current_end_time_list = [row[1] for row in rows]
                            # Remove the unks and surrounding words from the
                            # original ann. `ft` was undefined in the original;
                            # assuming it marks the first unknown-word start
                            # time. Entries are dicts, so index by 'time'.
                            ft = float(current_time_list[0])
                            current_chunk_ann = [
                                entry for entry in current_chunk_ann
                                if entry['time'] < ft
                            ]
                            unk_replacements = []
                            for i2, fta_start in enumerate(current_time_list):
                                # Find the user-generated ann. The original
                                # formatted the path with a stale loop variable
                                # `x`; fta_dir is assumed here.
                                fta_ann_file = '{}/{}_{}_{}.ann'.format(
                                    fta_dir, i, i2, fta_start)
                                if not check_is_empty(fta_ann_file):
                                    fta_ann = utils.read_ann(fta_ann_file)
                                    unk_replacements.extend(
                                        utils.add_ann(fta_ann, fta_start))
                            current_chunk_ann.extend(unk_replacements)
                        write_par_file(current_chunk_ann,
                                       os.path.join(dir, base + '.par'))
            else:
                email(r'Some files did not exist in {}, was not '
                      r'automatically combined.'.format(dir),
                      '*****@*****.**')
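
# Assumed .times format (inferred from the parsing in main, not documented in
# the original): one line per chunk, start and end time separated by a single
# space, e.g.
#
#     0.00 29.97
#     29.97 60.12
#
# The session-level file is read only for column 0 (chunk starts), while the
# per-chunk files supply both the start and end columns.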

def alt_main():
    gs_total = []
    fns_dict = defaultdict(int)
    fps_dict = defaultdict(int)
    subject_dict = {}
    for subject in subjects:
        if subject not in spanish:
            print(subject)
            tp, fp, fn, tl = [], [], [], []
            for num in range(24):
                annfile = '/data/eeg/{}/behavioral/FR1/session_0/{}.ann'.format(
                    subject, num)
                lstfile = '/data/eeg/{}/behavioral/FR1/session_0/{}.lst'.format(
                    subject, num)
                pklfile = '/scratch/francob/big_ram_analyses_alternatives/{}/{}.pkl'.format(
                    subject, num)
                gs_seen = []
                real_hits = []
                if os.path.exists(lstfile):
                    with open(lstfile) as lst:
                        raw_words = [line.rstrip('\n').strip().lower()
                                     for line in lst]
                    modify_dict = {modify(w): w for w in raw_words}
                    if os.path.exists(pklfile):
                        gs_seen = [w.lower()
                                   for w in parse(pklfile, raw_words, modify_dict)
                                   if w.lower() in raw_words]
                    if os.path.exists(annfile):
                        real_hits = [entry['word'].lower()
                                     for entry in utils.read_ann(annfile)
                                     if entry['word'].lower() in raw_words]
                    if gs_seen:
                        temp_tp = len(set(gs_seen).intersection(set(real_hits)))
                        temp_fp = len(set(gs_seen) - set(real_hits))
                        temp_fn = len(set(real_hits) - set(gs_seen))
                        temp_tl = len(set(real_hits))
                        tp.append(temp_tp)
                        fp.append(temp_fp)
                        fn.append(temp_fn)
                        tl.append(temp_tl)
                        # Tally which words are missed / falsely reported.
                        for w in set(real_hits) - set(gs_seen):
                            fns_dict[w] += 1
                        for w in set(gs_seen) - set(real_hits):
                            fps_dict[w] += 1
                        if temp_fn + temp_fp > 0:
                            print(subject, num)
                            print('real_hits')
                            print(list(real_hits))
                            print('gs_seen')
                            print(list(gs_seen))
            total_diff = (sum(tl) - sum(fp) - sum(fn)) / float(max(sum(tl), 1))
            if sum(tl):
                gs_total.append([total_diff, sum(tp), sum(fp), sum(fn), sum(tl)])
                subject_dict[subject] = total_diff
    diffs = [x[0] for x in gs_total]
    tps = [x[1] for x in gs_total]
    fps = [x[2] for x in gs_total]
    fns = [x[3] for x in gs_total]
    tls = [x[4] for x in gs_total]
    print(fns_dict)
    print(fps_dict)
    print([(s, subject_dict[s])
           for s in sorted(subject_dict, key=subject_dict.get, reverse=True)])
    print(diffs)
    print((sum(tls) - sum(fps) - sum(fns)) / float(sum(tls)))
    print(sum(fps) / float(sum(tls)))
    print(sum(fps))
    print(sum(fns))
    print(sum(fns) / float(sum(tls)))
    print(np.mean([subject_dict[s] for s in subject_dict]))
    # Group per-subject accuracy by the last character of the subject code.
    location_dict = {}
    for subject in subject_dict:
        location_dict.setdefault(subject[-1], []).append(subject_dict[subject])
    print(location_dict)
    for location in location_dict:
        print(location)
        print(np.mean(location_dict[location]))
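
# Hypothetical entry point (not present in the original, which appears to be
# driven from parsync.sh or an interactive session): run the alternative
# analysis by default.
if __name__ == '__main__':
    alt_main()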