def inspect_discourse_textgrid(path):
    """
    Generate a list of AnnotationTypes for a specified TextGrid file
    (or for every TextGrid file found under a directory)

    Parameters
    ----------
    path : str
        Full path to a TextGrid file or to a directory of TextGrid files

    Returns
    -------
    list of AnnotationTypes
        Autodetected AnnotationTypes for the TextGrid file
    """
    trans_delimiters = ['.', ' ', ';', ',']
    textgrids = []
    if os.path.isdir(path):
        for root, subdirs, files in os.walk(path):
            for filename in files:
                if not filename.lower().endswith('.textgrid'):
                    continue
                textgrids.append(os.path.join(root, filename))
    else:
        textgrids.append(path)
    anno_types = []
    for t in textgrids:
        tg = load_textgrid(t)
        spellings, segments, attributes = guess_tiers(tg)
        if len(segments) == 0:
            base = None
        else:
            base = segments[0]
        if len(spellings) == 0:
            anchor = None
        else:
            anchor = spellings[0]
        interval_tiers = [x for x in tg.tiers if isinstance(x, IntervalTier)]
        if len(anno_types) == 0:
            # First TextGrid: build one AnnotationType per interval tier
            for ti in interval_tiers:
                if ti.name in spellings:
                    a = AnnotationType(ti.name, base, None, anchor=True, token=False)
                elif ti.name in segments:
                    a = AnnotationType(ti.name, None, anchor, base=True, token=True)
                else:
                    labels = uniqueLabels(ti)
                    cat = Attribute.guess_type(labels, trans_delimiters)
                    att = Attribute(Attribute.sanitize_name(ti.name), cat, ti.name)
                    a = AnnotationType(ti.name, None, anchor, token=False, attribute=att)
                    if cat == 'tier':
                        # Autodetect the transcription delimiter from the labels
                        for l in labels:
                            for delim in trans_delimiters:
                                if delim in l:
                                    a.trans_delimiter = delim
                                    break
                            if a.trans_delimiter is not None:
                                break
                a.add((x.mark for x in ti), save=False)
                anno_types.append(a)
        else:
            # Subsequent TextGrids must match the tier structure of the first
            if len(anno_types) != len(interval_tiers):
                raise PCTError("The TextGrids must have the same number of tiers.")
            for i, ti in enumerate(interval_tiers):
                anno_types[i].add((x.mark for x in ti), save=False)
    return anno_types
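
# Usage sketch (not part of the original module): inspect a single TextGrid and
# review what was autodetected before loading it as a corpus. The file path is a
# placeholder, and only attributes set by inspect_discourse_textgrid itself
# (e.g. trans_delimiter) are inspected.
def _example_inspect_textgrid():
    anno_types = inspect_discourse_textgrid('/path/to/speaker1.TextGrid')
    for a in anno_types:
        # Each AnnotationType corresponds to one interval tier; transcription
        # tiers will have a detected trans_delimiter (or None).
        print(a, getattr(a, 'trans_delimiter', None))
    return anno_types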
def __enter__(self):
    """
    Enter the corpus context; raise a PCTError if the underlying corpus
    has no word tokens (i.e., no pronunciation variants).
    """
    self = BaseCorpusContext.__enter__(self)
    if not self.corpus.has_wordtokens:
        raise PCTError('The corpus specified does not have variants.')
    return self
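
# Usage sketch (illustrative only): "VariantContext" stands in for whichever
# subclass of BaseCorpusContext defines the __enter__ above, and its constructor
# arguments are placeholders. The point is that entering the context raises a
# PCTError when the corpus has no word tokens (pronunciation variants).
def _example_variant_context(corpus):
    try:
        with VariantContext(corpus) as c:
            return c.corpus.has_wordtokens  # always True inside the block
    except PCTError as e:
        print(e)  # 'The corpus specified does not have variants.'
        return False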
def inspect_csv(path, num_lines=10, coldelim=None, transdelim=None):
    """
    Generate a list of AnnotationTypes for a specified text file for parsing
    it as a column-delimited file

    Parameters
    ----------
    path : str
        Full path to text file
    num_lines : int, optional
        The number of lines to parse from the file
    coldelim : str, optional
        A prespecified column delimiter to use; will autodetect if not supplied
    transdelim : list, optional
        A prespecified set of transcription delimiters to look for; will
        autodetect if not supplied

    Returns
    -------
    atts : list of AnnotationType
        Autodetected AnnotationTypes for the text file
    best : str
        The detected (or supplied) column delimiter
    """
    if coldelim is not None:
        common_delimiters = [coldelim]
    else:
        common_delimiters = [',', '\t', ':', '|']
    if transdelim is not None:
        trans_delimiters = [transdelim]
    else:
        trans_delimiters = ['.', ' ', ';', ',']

    with open(path, 'r', encoding='utf-8-sig') as f:
        lines = []
        head = f.readline().strip()
        for line in f.readlines():
            lines.append(line.strip())

    # Pick the delimiter that splits the header into the most columns
    best = ''
    num = 1
    for d in common_delimiters:
        trial = len(head.split(d))
        if trial > num:
            num = trial
            best = d
    if best == '':
        raise DelimiterError('The column delimiter specified did not create multiple columns.')

    head = head.split(best)
    vals = {h: list() for h in head}

    for line in lines:
        l = line.strip().split(best)
        if len(l) != len(head):
            raise PCTError('{}, {}'.format(l, head))
        for i in range(len(head)):
            vals[head[i]].append(l[i])

    atts = list()
    for h in head:
        if h in ['Transcription', 'transcription']:
            cat = 'tier'
        else:
            cat = Attribute.guess_type(vals[h][:num_lines], trans_delimiters)
        att = Attribute(Attribute.sanitize_name(h), cat, h)
        a = AnnotationType(h, None, None, token=False, attribute=att)
        if cat == 'tier':
            # Autodetect the transcription delimiter from the first and last values
            for t in trans_delimiters:
                if t in vals[h][0] or t in vals[h][-1]:
                    a.trans_delimiter = t
                    break
        a.add(vals[h], save=False)
        atts.append(a)

    return atts, best
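
# Usage sketch (not part of the original module): inspect a column-delimited
# file, check the detected delimiter, and look over the autodetected
# AnnotationTypes before handing them off to a loader. The file path is a
# placeholder.
def _example_inspect_csv():
    anno_types, col_delim = inspect_csv('/path/to/lexicon.txt', num_lines=10)
    print('Detected column delimiter:', repr(col_delim))
    for a in anno_types:
        print(a, getattr(a, 'trans_delimiter', None))
    return anno_types, col_delim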
def calc_prod(corpus_context, envs, strict=True, all_info=False,
              stop_check=None, call_back=None):
    """
    Main function for calculating predictability of distribution for
    two segments over specified environments in a corpus.

    Parameters
    ----------
    corpus_context : CorpusContext
        Context manager for a corpus
    envs : list of EnvironmentFilter
        List of EnvironmentFilter objects that specify environments
    strict : bool
        If True, exceptions will be raised for non-exhaustive environments
        and non-unique environments. If False, only warnings will be shown.
        Defaults to True.
    all_info : bool
        If True, all the intermediate numbers for calculating predictability
        of distribution will be returned. If False, only the final entropy
        will be returned. Defaults to False.
    stop_check : callable, optional
        Optional function to check whether to gracefully terminate early
    call_back : callable, optional
        Optional function to supply progress information during the function

    Returns
    -------
    dict
        Keys are the environments specified (plus an 'AVG' key for the
        frequency-weighted average across environments) and values are either
        a list of [entropy, frequency of environment, frequency of seg1,
        frequency of seg2] if all_info is True, or just entropy if all_info
        is False.
    """
    seg_list = envs[0].middle
    for e in envs:
        if e.middle != seg_list:
            raise PCTError("Middle segments of all environments must be the same.")
    returned = check_envs(corpus_context, envs, stop_check, call_back)
    if stop_check is not None and stop_check():
        return
    env_matches, miss_envs, overlap_envs = returned
    if miss_envs or overlap_envs:
        if strict:
            raise ProdError(envs, miss_envs, overlap_envs)

    H_dict = OrderedDict()

    # Calculate entropy in individual environments first
    total_matches = {x: 0 for x in seg_list}
    total_frequency = 0
    if call_back is not None:
        call_back('Calculating predictability of distribution...')
        call_back(0, len(corpus_context))
        cur = 0
    for env in env_matches:
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            cur += 1
            call_back(cur)
        total_tokens = 0
        matches = {}
        for seg in seg_list:
            matches[seg] = env_matches[env][seg]
            total_matches[seg] += matches[seg]
            total_tokens += matches[seg]
        total_frequency += total_tokens
        if not total_tokens:
            H = 0
        else:
            seg_H = {}
            for seg in seg_list:
                seg_prob = matches[seg] / total_tokens
                seg_H[seg] = log2(seg_prob) * seg_prob if seg_prob > 0 else 0
            H = sum(seg_H.values()) * -1
            if not H:
                H = H + 0  # avoid the -0.0 problem
        H_dict[env] = [H, total_tokens] + [matches[x] for x in seg_list]

    # Calculate the frequency-weighted entropy last
    weighted_H = 0
    for env in env_matches:
        weighted_H += (H_dict[env][0] * (H_dict[env][1] / total_frequency)
                       if total_frequency > 0 else 0)

    try:
        avg_h = sum(total_matches.values()) / total_frequency
    except ZeroDivisionError:
        avg_h = 0.0

    H_dict['AVG'] = [weighted_H, avg_h] + [total_matches[x] for x in seg_list]
    if not all_info:
        for k, v in H_dict.items():
            H_dict[k] = v[0]
    return H_dict
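
# Worked sketch of the numbers calc_prod produces, using made-up counts
# (purely illustrative, not drawn from any real corpus). For each environment
# the entropy is H = -sum(p * log2(p)) over the two segments, and the 'AVG'
# entropy is the frequency-weighted average of the per-environment entropies.
from math import log2


def _example_entropy():
    # Hypothetical counts of segments 's' and 'ʃ' in two environments
    env_matches = {'_[+voc]': {'s': 12, 'ʃ': 4},   # both occur: some uncertainty
                   '_#':      {'s': 0,  'ʃ': 30}}  # only one occurs: fully predictable
    H_dict = {}
    total_frequency = sum(sum(m.values()) for m in env_matches.values())
    for env, matches in env_matches.items():
        total_tokens = sum(matches.values())
        H = -sum((n / total_tokens) * log2(n / total_tokens)
                 for n in matches.values() if n > 0)
        H_dict[env] = H
    weighted_H = sum(H_dict[env] * sum(m.values()) / total_frequency
                     for env, m in env_matches.items())
    print(H_dict)      # {'_[+voc]': ~0.811, '_#': 0.0}
    print(weighted_H)  # ~0.811 * 16/46 ≈ 0.282
    return H_dict, weighted_H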