Example #1
0
def inspect_discourse_ilg(path, number=None):
    """
    Generate a list of AnnotationTypes for a specified text file for parsing
    it as an interlinear gloss text file

    Parameters
    ----------
    path : str
        Full path to a text file, or to a directory that is walked
        recursively for files ending in ``.txt`` (case-insensitive)
    number : int, optional
        Number of lines per gloss, if not supplied, it is auto-detected
        (for a directory, ``most_frequent_value`` is applied to the
        per-file results of ``calculate_lines_per_gloss``)

    Returns
    -------
    list of AnnotationTypes
        Autodetected AnnotationTypes for the text file, one per gloss line;
        an empty list if ``path`` is a directory without any ``.txt`` file
    """
    trans_delimiters = ['.', ';', ',']
    lines = {}
    if os.path.isdir(path):
        for root, subdirs, files in os.walk(path):
            for filename in files:
                if not filename.lower().endswith('.txt'):
                    continue
                p = os.path.join(root, filename)
                lines[p] = text_to_lines(p)
        if not lines:
            # Nothing to inspect; without this guard ``p`` would be unbound
            # below.
            return []
        if number is None:
            numbers = {k: calculate_lines_per_gloss(v)
                       for k, v in lines.items()}
            number = most_frequent_value(numbers)
    else:
        lines[path] = text_to_lines(path)
        # Only auto-detect when the caller did not specify a value
        if number is None:
            number = calculate_lines_per_gloss(lines[path])
        p = path

    # ``p`` is the reference file (the single file, or the last one found in
    # the directory walk); its first ``number`` lines seed the types.
    reference = lines[p]
    annotation_types = []
    for i in range(number):
        name = 'Line {}'.format(i + 1)
        if i == 0:
            # The first line of each gloss is the spelling anchor
            att = Attribute('spelling', 'spelling', 'Spelling')
            a = AnnotationType(name,
                               None,
                               None,
                               anchor=True,
                               token=False,
                               attribute=att)
        else:
            labels = reference[i][1]
            cat = Attribute.guess_type(labels, trans_delimiters)
            att = Attribute(Attribute.sanitize_name(name), cat, name)
            a = AnnotationType(name,
                               None,
                               annotation_types[0].name,
                               token=False,
                               attribute=att)
            if cat == 'tier' and a.trans_delimiter is None:
                # Use the first delimiter that occurs in any label
                for l in labels:
                    for delim in trans_delimiters:
                        if delim in l:
                            a.trans_delimiter = delim
                            break
                    if a.trans_delimiter is not None:
                        break
        a.add(reference[i][1], save=False)
        annotation_types.append(a)

    # Feed the corresponding lines of every other file into the same types
    for k, file_lines in lines.items():
        if k == p:
            continue
        for i in range(number):
            annotation_types[i].add(file_lines[i][1], save=False)

    return annotation_types
Example #2
0
def inspect_discourse_textgrid(path):
    """
    Build the AnnotationTypes describing the interval tiers of TextGrids

    Parameters
    ----------
    path : str
        Full path to a TextGrid file, or to a directory that is walked
        recursively for files ending in ``.textgrid`` (case-insensitive)

    Returns
    -------
    list of AnnotationTypes
        One AnnotationType per interval tier of the first TextGrid, with
        the marks of every further TextGrid added tier by tier

    Raises
    ------
    PCTError
        If a later TextGrid does not have as many interval tiers as there
        are AnnotationTypes already built
    """
    trans_delimiters = ['.', ' ', ';', ',']
    if os.path.isdir(path):
        textgrids = [os.path.join(root, filename)
                     for root, subdirs, files in os.walk(path)
                     for filename in files
                     if filename.lower().endswith('.textgrid')]
    else:
        textgrids = [path]

    anno_types = []
    for tg_path in textgrids:
        tg = load_textgrid(tg_path)
        spellings, segments, attributes = guess_tiers(tg)
        base = segments[0] if len(segments) != 0 else None
        anchor = spellings[0] if len(spellings) != 0 else None
        interval_tiers = [tier for tier in tg.tiers
                          if isinstance(tier, IntervalTier)]

        if anno_types:
            # Types already exist: just append this file's marks to them
            if len(anno_types) != len(interval_tiers):
                raise PCTError(
                    "The TextGrids must have the same number of tiers.")
            for existing, tier in zip(anno_types, interval_tiers):
                existing.add((x.mark for x in tier), save=False)
            continue

        # No types yet: create one per interval tier
        for tier in interval_tiers:
            tier_name = tier.name
            if tier_name in spellings:
                a = AnnotationType(tier_name, base, None,
                                   anchor=True, token=False)
            elif tier_name in segments:
                a = AnnotationType(tier_name, None, anchor,
                                   base=True, token=True)
            else:
                labels = uniqueLabels(tier)
                cat = Attribute.guess_type(labels, trans_delimiters)
                att = Attribute(Attribute.sanitize_name(tier_name), cat,
                                tier_name)
                a = AnnotationType(tier_name, None, anchor,
                                   token=False, attribute=att)
                if cat == 'tier':
                    # Stop at the first label after which a delimiter is set
                    for label in labels:
                        found = next((d for d in trans_delimiters
                                      if d in label), None)
                        if found is not None:
                            a.trans_delimiter = found
                        if a.trans_delimiter is not None:
                            break
            a.add((x.mark for x in tier), save=False)
            anno_types.append(a)

    return anno_types
Example #3
0
def inspect_discourse_textgrid(path):
    """
    Build the AnnotationTypes describing the interval tiers of TextGrids

    Parameters
    ----------
    path : str
        Full path to a TextGrid file, or to a directory that is walked
        recursively for files ending in ``.textgrid`` (case-insensitive)

    Returns
    -------
    list of AnnotationTypes
        One AnnotationType per interval tier of the first TextGrid, with
        the marks of every further TextGrid added tier by tier

    Raises
    ------
    PCTError
        If a later TextGrid does not have as many interval tiers as there
        are AnnotationTypes already built
    """
    trans_delimiters = ['.', ' ', ';', ',']
    if os.path.isdir(path):
        textgrids = [os.path.join(root, filename)
                     for root, subdirs, files in os.walk(path)
                     for filename in files
                     if filename.lower().endswith('.textgrid')]
    else:
        textgrids = [path]

    anno_types = []
    for tg_path in textgrids:
        tg = load_textgrid(tg_path)
        spellings, segments, attributes = guess_tiers(tg)
        base = segments[0] if len(segments) != 0 else None
        anchor = spellings[0] if len(spellings) != 0 else None
        interval_tiers = [tier for tier in tg.tiers
                          if isinstance(tier, IntervalTier)]

        if anno_types:
            # Types already exist: just append this file's marks to them
            if len(anno_types) != len(interval_tiers):
                raise PCTError(
                    "The TextGrids must have the same number of tiers.")
            for existing, tier in zip(anno_types, interval_tiers):
                existing.add((x.mark for x in tier), save=False)
            continue

        # No types yet: create one per interval tier
        for tier in interval_tiers:
            tier_name = tier.name
            if tier_name in spellings:
                a = AnnotationType(tier_name, base, None,
                                   anchor=True, token=False)
            elif tier_name in segments:
                a = AnnotationType(tier_name, None, anchor,
                                   base=True, token=True)
            else:
                labels = uniqueLabels(tier)
                cat = Attribute.guess_type(labels, trans_delimiters)
                att = Attribute(Attribute.sanitize_name(tier_name), cat,
                                tier_name)
                a = AnnotationType(tier_name, None, anchor,
                                   token=False, attribute=att)
                if cat == 'tier':
                    # Stop at the first label after which a delimiter is set
                    for label in labels:
                        found = next((d for d in trans_delimiters
                                      if d in label), None)
                        if found is not None:
                            a.trans_delimiter = found
                        if a.trans_delimiter is not None:
                            break
            a.add((x.mark for x in tier), save=False)
            anno_types.append(a)

    return anno_types
def inspect_discourse_ilg(path, number=None):
    """
    Generate a list of AnnotationTypes for a specified text file for parsing
    it as an interlinear gloss text file

    Parameters
    ----------
    path : str
        Full path to a text file, or to a directory that is walked
        recursively for files ending in ``.txt`` (case-insensitive)
    number : int, optional
        Number of lines per gloss, if not supplied, it is auto-detected
        (for a directory, ``most_frequent_value`` is applied to the
        per-file results of ``calculate_lines_per_gloss``)

    Returns
    -------
    list of AnnotationTypes
        Autodetected AnnotationTypes for the text file, one per gloss line;
        an empty list if ``path`` is a directory without any ``.txt`` file
    """
    trans_delimiters = ['.', ';', ',']
    lines = {}
    if os.path.isdir(path):
        for root, subdirs, files in os.walk(path):
            for filename in files:
                if not filename.lower().endswith('.txt'):
                    continue
                p = os.path.join(root, filename)
                lines[p] = text_to_lines(p)
        if not lines:
            # Nothing to inspect; without this guard ``p`` would be unbound
            # below.
            return []
        if number is None:
            numbers = {k: calculate_lines_per_gloss(v)
                       for k, v in lines.items()}
            number = most_frequent_value(numbers)
    else:
        lines[path] = text_to_lines(path)
        # Only auto-detect when the caller did not specify a value
        if number is None:
            number = calculate_lines_per_gloss(lines[path])
        p = path

    # ``p`` is the reference file (the single file, or the last one found in
    # the directory walk); its first ``number`` lines seed the types.
    reference = lines[p]
    annotation_types = []
    for i in range(number):
        name = 'Line {}'.format(i + 1)
        if i == 0:
            # The first line of each gloss is the spelling anchor
            att = Attribute('spelling', 'spelling', 'Spelling')
            a = AnnotationType(name, None, None, anchor=True, token=False,
                               attribute=att)
        else:
            labels = reference[i][1]
            cat = Attribute.guess_type(labels, trans_delimiters)
            att = Attribute(Attribute.sanitize_name(name), cat, name)
            a = AnnotationType(name, None, annotation_types[0].name,
                               token=False, attribute=att)
            if cat == 'tier' and a.trans_delimiter is None:
                # Use the first delimiter that occurs in any label
                for l in labels:
                    for delim in trans_delimiters:
                        if delim in l:
                            a.trans_delimiter = delim
                            break
                    if a.trans_delimiter is not None:
                        break
        a.add(reference[i][1], save=False)
        annotation_types.append(a)

    # Feed the corresponding lines of every other file into the same types
    for k, file_lines in lines.items():
        if k == p:
            continue
        for i in range(number):
            annotation_types[i].add(file_lines[i][1], save=False)

    return annotation_types
Example #5
0
def inspect_csv(path, num_lines=10, coldelim=None, transdelim=None):
    """
    Generate a list of AnnotationTypes for a specified text file for parsing
    it as a column-delimited file

    Parameters
    ----------
    path : str
        Full path to text file
    num_lines: int, optional
        The number of data rows (after the header) used to guess each
        column's type; every row of the file is still read and added
    coldelim: str, optional
        A prespecified column delimiter to use, will autodetect if not
        supplied
    transdelim : str, optional
        A prespecified transcription delimiter to look for, will
        autodetect from a default set if not supplied

    Returns
    -------
    tuple of (list of AnnotationTypes, str)
        Autodetected AnnotationTypes for the text file (one per header
        column) and the column delimiter that was used

    Raises
    ------
    DelimiterError
        If no candidate delimiter splits the header into multiple columns
    PCTError
        If a data row does not have as many columns as the header
    """
    if coldelim is not None:
        common_delimiters = [coldelim]
    else:
        common_delimiters = [',', '\t', ':', '|']
    if transdelim is not None:
        trans_delimiters = [transdelim]
    else:
        trans_delimiters = ['.', ' ', ';', ',']

    # NOTE(review): strip() also removes a trailing tab, so a tab-delimited
    # row whose last column is empty loses that column and fails the
    # column-count check below.
    with open(path, 'r', encoding='utf-8') as f:
        head = f.readline().strip()
        # A blank line can never form a valid multi-column row, so skip it
        # instead of failing the column-count check.
        lines = [row for row in (line.strip() for line in f) if row]

    # Pick the candidate delimiter that yields the most header columns
    best = ''
    num = 1
    for d in common_delimiters:
        trial = len(head.split(d))
        if trial > num:
            num = trial
            best = d
    if best == '':
        raise DelimiterError('The column delimiter specified did not create multiple columns.')

    head = head.split(best)
    vals = {h: [] for h in head}

    for line in lines:
        l = line.split(best)
        if len(l) != len(head):
            raise PCTError('{}, {}'.format(l, head))
        for h, value in zip(head, l):
            vals[h].append(value)

    atts = []
    for h in head:
        column = vals[h]
        cat = Attribute.guess_type(column[:num_lines], trans_delimiters)
        att = Attribute(Attribute.sanitize_name(h), cat, h)
        a = AnnotationType(h, None, None, token=False, attribute=att)
        # The delimiter is detected from the first value only; a header-only
        # file has no first value, so skip detection.
        if cat == 'tier' and column:
            for t in trans_delimiters:
                if t in column[0]:
                    a.trans_delimiter = t
                    break
        a.add(column, save=False)
        atts.append(a)

    return atts, best
Example #6
0
def inspect_csv(path, num_lines=10, coldelim=None, transdelim=None):
    """
    Generate a list of AnnotationTypes for a specified text file for parsing
    it as a column-delimited file

    Parameters
    ----------
    path : str
        Full path to text file
    num_lines: int, optional
        The number of data rows (after the header) used to guess each
        column's type; every row of the file is still read and added
    coldelim: str, optional
        A prespecified column delimiter to use, will autodetect if not
        supplied
    transdelim : str, optional
        A prespecified transcription delimiter to look for, will
        autodetect from a default set if not supplied

    Returns
    -------
    tuple of (list of AnnotationTypes, str)
        Autodetected AnnotationTypes for the text file (one per header
        column) and the column delimiter that was used

    Raises
    ------
    DelimiterError
        If no candidate delimiter splits the header into multiple columns
    PCTError
        If a data row does not have as many columns as the header
    """
    if coldelim is not None:
        common_delimiters = [coldelim]
    else:
        common_delimiters = [',', '\t', ':', '|']
    if transdelim is not None:
        trans_delimiters = [transdelim]
    else:
        trans_delimiters = ['.', ' ', ';', ',']

    # NOTE(review): strip() also removes a trailing tab, so a tab-delimited
    # row whose last column is empty loses that column and fails the
    # column-count check below.
    with open(path, 'r', encoding='utf-8') as f:
        head = f.readline().strip()
        # A blank line can never form a valid multi-column row, so skip it
        # instead of failing the column-count check.
        lines = [row for row in (line.strip() for line in f) if row]

    # Pick the candidate delimiter that yields the most header columns
    best = ''
    num = 1
    for d in common_delimiters:
        trial = len(head.split(d))
        if trial > num:
            num = trial
            best = d
    if best == '':
        raise DelimiterError('The column delimiter specified did not create multiple columns.')

    head = head.split(best)
    vals = {h: [] for h in head}

    for line in lines:
        l = line.split(best)
        if len(l) != len(head):
            raise PCTError('{}, {}'.format(l, head))
        for h, value in zip(head, l):
            vals[h].append(value)

    atts = []
    for h in head:
        column = vals[h]
        cat = Attribute.guess_type(column[:num_lines], trans_delimiters)
        att = Attribute(Attribute.sanitize_name(h), cat, h)
        a = AnnotationType(h, None, None, token=False, attribute=att)
        # The delimiter is detected from the first value only; a header-only
        # file has no first value, so skip detection.
        if cat == 'tier' and column:
            for t in trans_delimiters:
                if t in column[0]:
                    a.trans_delimiter = t
                    break
        a.add(column, save=False)
        atts.append(a)

    return atts, best