Exemple #1
0
def inspect_discourse_textgrid(path):
    """
    Generate a list of AnnotationTypes for a specified TextGrid file

    Parameters
    ----------
    path : str
        Full path to TextGrid file

    Returns
    -------
    list of AnnotationTypes
        Autodetected AnnotationTypes for the TextGrid file
    """
    trans_delimiters = ['.', ' ', ';', ',']
    textgrids = []
    if os.path.isdir(path):
        for root, subdirs, files in os.walk(path):
            for filename in files:
                if not filename.lower().endswith('.textgrid'):
                    continue
                textgrids.append(os.path.join(root, filename))
    else:
        textgrids.append(path)
    anno_types = []
    for t in textgrids:
        tg = load_textgrid(t)
        spellings, segments, attributes = guess_tiers(tg)
        if len(segments) == 0:
            base = None
        else:
            base = segments[0]
        if len(spellings) == 0:
            anchor = None
        else:
            anchor = spellings[0]
        interval_tiers = [x for x in tg.tiers if isinstance(x, IntervalTier)]
        if len(anno_types) == 0:
            for ti in interval_tiers:
                if ti.name in spellings:
                    a = AnnotationType(ti.name,
                                       base,
                                       None,
                                       anchor=True,
                                       token=False)
                elif ti.name in segments:
                    a = AnnotationType(ti.name,
                                       None,
                                       anchor,
                                       base=True,
                                       token=True)
                else:
                    labels = uniqueLabels(ti)
                    cat = Attribute.guess_type(labels, trans_delimiters)
                    att = Attribute(Attribute.sanitize_name(ti.name), cat,
                                    ti.name)
                    a = AnnotationType(ti.name,
                                       None,
                                       anchor,
                                       token=False,
                                       attribute=att)
                    if cat == 'tier':
                        for l in labels:
                            for delim in trans_delimiters:
                                if delim in l:
                                    a.trans_delimiter = delim
                                    break
                            if a.trans_delimiter is not None:
                                break
                a.add((x.mark for x in ti), save=False)
                anno_types.append(a)
        else:
            if len(anno_types) != len(interval_tiers):
                raise (PCTError(
                    "The TextGrids must have the same number of tiers."))
            for i, ti in enumerate(interval_tiers):
                anno_types[i].add((x.mark for x in ti), save=False)

    return anno_types
Exemple #2
0
 def __enter__(self):
     self = BaseCorpusContext.__enter__(self)
     if not self.corpus.has_wordtokens:
         raise (PCTError('The corpus specified does not have variants.'))
     return self
Exemple #3
0
def inspect_csv(path, num_lines = 10, coldelim = None, transdelim = None):
    """
    Generate a list of AnnotationTypes for a specified text file for parsing
    it as a column-delimited file

    Parameters
    ----------
    path : str
        Full path to text file
    num_lines: int, optional
        The number of lines to parse from the file
    coldelim: str, optional
        A prespecified column delimiter to use, will autodetect if not
        supplied
    transdelim : list, optional
        A prespecfied set of transcription delimiters to look for, will
        autodetect if not supplied

    Returns
    -------
    list of AnnotationTypes
        Autodetected AnnotationTypes for the text file
    """
    if coldelim is not None:
        common_delimiters = [coldelim]
    else:
        common_delimiters = [',','\t',':','|']
    if transdelim is not None:
        trans_delimiters = [transdelim]
    else:
        trans_delimiters = ['.',' ', ';', ',']

    with open(path,'r', encoding='utf-8-sig') as f:
        lines = []
        head = f.readline().strip()
        for line in f.readlines():
            lines.append(line.strip())

    best = ''
    num = 1
    for d in common_delimiters:
        trial = len(head.split(d))
        if trial > num:
            num = trial
            best = d
    if best == '':
        raise(DelimiterError('The column delimiter specified did not create multiple columns.'))

    head = head.split(best)
    vals = {h: list() for h in head}

    for line in lines:
        l = line.strip().split(best)
        if len(l) != len(head):
            raise(PCTError('{}, {}'.format(l,head)))
        for i in range(len(head)):
            vals[head[i]].append(l[i])
    atts = list()
    for h in head:
        if h in ['Transcription', 'transcription']:
            cat = 'tier'
        else:
            cat = Attribute.guess_type(vals[h][:num_lines], trans_delimiters)
        att = Attribute(Attribute.sanitize_name(h), cat, h)
        a = AnnotationType(h, None, None, token=False, attribute=att)
        if cat == 'tier':
            for t in trans_delimiters:
                if t in vals[h][0] or t in vals[h][-1]:
                    a.trans_delimiter = t
                    break
        a.add(vals[h], save = False)
        atts.append(a)

    return atts, best
def calc_prod(corpus_context,
              envs,
              strict=True,
              all_info=False,
              stop_check=None,
              call_back=None):
    """
    Main function for calculating predictability of distribution for
    two segments over specified environments in a corpus.

    Parameters
    ----------
    corpus_context : CorpusContext
        Context manager for a corpus
    envs : list of EnvironmentFilter
        List of EnvironmentFilter objects that specify environments
    strict : bool
        If true, exceptions will be raised for non-exhausive environments
        and non-unique environments.  If false, only warnings will be
        shown.  Defaults to True.
    all_info : bool
        If true, all the intermediate numbers for calculating predictability
        of distribution will be returned.  If false, only the final entropy
        will be returned.  Defaults to False.
    stop_check : callable, optional
        Optional function to check whether to gracefully terminate early
    call_back : callable, optional
        Optional function to supply progress information during the function

    Returns
    -------
    dict
        Keys are the environments specified and values are either a list
        of [entropy, frequency of environment, frequency of seg1, frequency
        of seg2] if all_info is True, or just entropy if all_info is False.
    """
    seg_list = envs[0].middle
    for e in envs:
        if e.middle != seg_list:
            raise (PCTError(
                "Middle segments of all environments must be the same."))

    returned = check_envs(corpus_context, envs, stop_check, call_back)

    if stop_check is not None and stop_check():
        return
    env_matches, miss_envs, overlap_envs = returned
    if miss_envs or overlap_envs:
        if strict:
            raise (ProdError(envs, miss_envs, overlap_envs))

    H_dict = OrderedDict()

    #CALCULATE ENTROPY IN INDIVIDUAL ENVIRONMENTS FIRST
    total_matches = {x: 0 for x in seg_list}
    total_frequency = 0

    if call_back is not None:
        call_back('Calculating predictability of distribution...')
        call_back(0, len(corpus_context))
        cur = 0
    for env in env_matches:
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            cur += 1
            call_back(cur)
        total_tokens = 0
        matches = {}
        for seg in seg_list:
            matches[seg] = env_matches[env][seg]
            total_matches[seg] += matches[seg]
            total_tokens += matches[seg]
        total_frequency += total_tokens

        if not total_tokens:
            H = 0
        else:
            seg_H = {}
            for seg in seg_list:
                seg_prob = matches[seg] / total_tokens
                seg_H[seg] = log2(seg_prob) * seg_prob if seg_prob > 0 else 0
            H = sum(seg_H.values()) * -1
            if not H:
                H = H + 0  #avoid the -0.0 problem
        H_dict[env] = [H, total_tokens] + [matches[x] for x in seg_list]

    #CALCULATE WEIGHTED ENTROPY LAST
    weighted_H = 0
    for env in env_matches:
        weighted_H += H_dict[env][0] * (
            H_dict[env][1] / total_frequency) if total_frequency > 0 else 0

    try:
        avg_h = sum(total_matches.values()) / total_frequency
    except ZeroDivisionError:
        avg_h = 0.0

    H_dict['AVG'] = [weighted_H, avg_h] + [total_matches[x] for x in seg_list]

    if not all_info:
        for k, v in H_dict.items():
            H_dict[k] = v[0]
    return H_dict