Python Attribute.sanitize_nameの例、corpustools.corpus.classes.Attribute.sanitize_name Pythonの例

コード例 #1

0

ファイルを表示

ファイル: helper.py プロジェクト: mmcauliffe/CorpusTools

 def __init__(self, name, subtype, supertype, attribute=None, anchor=False, token=False, base=False, speaker=None):
     """
     Initialize the parsing settings and the backing Attribute.

     Parameters
     ----------
     name : str
         Name of this annotation type; ``output_name`` is derived from it
     subtype : object
         Stored unchanged on ``self.subtype``
     supertype : object
         Stored unchanged on ``self.supertype``
     attribute : Attribute, optional
         Attribute to use; when omitted one is built from ``name``
     anchor : bool, optional
         Stored unchanged on ``self.anchor``
     token : bool, optional
         Stored unchanged on ``self.token``
     base : bool, optional
         When no ``attribute`` is given, selects a 'tier' Attribute
         instead of a 'spelling' one
     speaker : object, optional
         Speaker label; its text is removed from ``name`` to form
         ``output_name``
     """
     # Parsing configuration, all empty/unset until the caller fills it in.
     self.characters = set()
     self.ignored_characters = set()
     self.digraphs = set()
     self.trans_delimiter = None
     self.morph_delimiters = set()
     self.number_behavior = None
     self._list = []

     self.name = name
     self.subtype = subtype
     self.supertype = supertype
     self.token = token
     self.base = base
     self.anchor = anchor
     self.speaker = speaker
     self.ignored = False

     if self.speaker is not None:
         # re.escape makes the speaker label match literally even when it
         # contains regex metacharacters; the raw string keeps ``\W`` a
         # regex class instead of an invalid string escape.
         speaker_pattern = re.escape("{}".format(self.speaker)) + r"\W*"
         self.output_name = re.sub(speaker_pattern, "", self.name)
     else:
         self.output_name = self.name

     if attribute is None:
         category = "tier" if base else "spelling"
         self.attribute = Attribute(Attribute.sanitize_name(name), category, name)
     else:
         self.attribute = attribute

コード例 #2

0

ファイルを表示

 def __init__(self, name, subtype, supertype, attribute = None, anchor = False,
                 token = False, base = False, speaker = None):
     """
     Initialize the parsing settings and the backing Attribute.

     Parameters
     ----------
     name : str
         Name of this annotation type; ``output_name`` is derived from it
     subtype : object
         Stored unchanged on ``self.subtype``
     supertype : object
         Stored unchanged on ``self.supertype``
     attribute : Attribute, optional
         Attribute to use; when omitted one is built from ``name``
     anchor : bool, optional
         Stored unchanged on ``self.anchor``
     token : bool, optional
         Stored unchanged on ``self.token``
     base : bool, optional
         When no ``attribute`` is given, selects a 'tier' Attribute
         instead of a 'spelling' one
     speaker : object, optional
         Speaker label; its text is removed from ``name`` to form
         ``output_name``
     """
     # Parsing configuration starts out empty/unset.
     self.characters = set()
     self.ignored_characters = set()
     self.digraphs = set()
     self.trans_delimiter = None
     self.morph_delimiters = set()
     self.number_behavior = None
     self._list = []

     self.name = name
     self.subtype = subtype
     self.supertype = supertype
     self.token = token
     self.base = base
     self.anchor = anchor
     self.speaker = speaker
     self.ignored = False

     if self.speaker is None:
         self.output_name = self.name
     else:
         # Escape the speaker text so regex metacharacters in it are matched
         # literally; the raw string avoids the invalid ``\W`` string escape.
         speaker_pattern = re.escape('{}'.format(self.speaker)) + r'\W*'
         self.output_name = re.sub(speaker_pattern, '', self.name)

     if attribute is not None:
         self.attribute = attribute
     elif base:
         self.attribute = Attribute(Attribute.sanitize_name(name), 'tier', name)
     else:
         self.attribute = Attribute(Attribute.sanitize_name(name), 'spelling', name)

コード例 #3

0

ファイルを表示

 def __init__(
     self,
     name,
     subtype,
     supertype,
     attribute=None,
     anchor=False,
     token=False,
     base=False,
     speaker=None,
     is_default=False,
 ):
     """
     Initialize the parsing settings and the backing Attribute.

     Parameters
     ----------
     name : str
         Name of this annotation type; ``output_name`` is derived from it
     subtype : object
         Stored unchanged on ``self.subtype``
     supertype : object
         Stored unchanged on ``self.supertype``
     attribute : Attribute, optional
         Attribute to use; when omitted one is built from ``name``
     anchor : bool, optional
         Marks a spelling type
     token : bool, optional
         Stored unchanged on ``self.token``
     base : bool, optional
         Marks a transcription/tier type; when no ``attribute`` is given,
         selects a 'tier' Attribute instead of a 'spelling' one
     speaker : object, optional
         Speaker label; its text is removed from ``name`` to form
         ``output_name``
     is_default : bool, optional
         Stored on ``self.is_default`` and forwarded to the Attribute
         that is built when ``attribute`` is omitted
     """
     self.characters = set()
     self.ignored_characters = set()
     self.digraphs = set()
     self.trans_delimiter = None
     self.morph_delimiters = set()
     self.number_behavior = None
     # This list contains Annotations for spelling and BaseAnnotations for
     # transcriptions.
     self._list = []
     # This variable name is confusing - it represents something like
     # "Orthography" or "Transcription", rather than the name that the user
     # would have given to the column, e.g. "canonical_pron" or "Spelling".
     # To get the user's preferred name, look at self.output_name or
     # self.attribute.
     self.name = name
     self.subtype = subtype
     self.supertype = supertype
     self.token = token
     self.base = base  # base is transcription/tier type
     self.anchor = anchor  # anchor is spelling type
     self.speaker = speaker
     self.ignored = False
     self.is_default = is_default

     if self.speaker is not None:
         # re.escape makes the speaker label match literally even when it
         # contains regex metacharacters; the raw string keeps ``\W`` a
         # regex class instead of an invalid string escape.
         speaker_pattern = re.escape('{}'.format(self.speaker)) + r'\W*'
         self.output_name = re.sub(speaker_pattern, '', self.name)
     else:
         self.output_name = self.name

     if attribute is None:
         category = 'tier' if base else 'spelling'
         self.attribute = Attribute(Attribute.sanitize_name(name),
                                    category,
                                    name,
                                    is_default=is_default)
     else:
         self.attribute = attribute

コード例 #4

0

ファイルを表示

ファイル: helper.py プロジェクト: PhonologicalCorpusTools/CorpusTools

    def __init__(self, name, subtype, supertype, attribute=None, anchor=False, token=False, base=False, speaker=None,
                 is_default=False):
        """
        Initialize the parsing settings and the backing Attribute.

        Parameters
        ----------
        name : str
            Name of this annotation type; ``output_name`` is derived from it
        subtype : object
            Stored unchanged on ``self.subtype``
        supertype : object
            Stored unchanged on ``self.supertype``
        attribute : Attribute, optional
            Attribute to use; when omitted one is built from ``name``
        anchor : bool, optional
            Marks a spelling type
        token : bool, optional
            Stored unchanged on ``self.token``
        base : bool, optional
            Marks a transcription/tier type; when no ``attribute`` is given,
            selects a 'tier' Attribute instead of a 'spelling' one
        speaker : object, optional
            Speaker label; its text is removed from ``name`` to form
            ``output_name``
        is_default : bool, optional
            Stored on ``self.is_default`` and forwarded to the Attribute
            that is built when ``attribute`` is omitted
        """
        self.characters = set()
        self.ignored_characters = set()
        self.digraphs = set()
        self.trans_delimiter = None
        self.syllable_delimiter = None
        self.morph_delimiters = set()
        self.number_behavior = None

        self.stress_specification = dict()
        self.tone_specification = dict()

        # This list contains Annotations for spelling and BaseAnnotations for
        # transcriptions.
        self._list = []
        # This variable name is confusing - it represents something like
        # "Orthography" or "Transcription", rather than the name that the
        # user would have given to the column, e.g. "canonical_pron" or
        # "Spelling". To get the user's preferred name, look at
        # self.output_name or self.attribute.
        self.name = name
        self.subtype = subtype
        self.supertype = supertype
        self.token = token
        self.base = base  # base is transcription/tier type
        self.anchor = anchor  # anchor is spelling type
        self.speaker = speaker
        self.ignored = False
        self.is_default = is_default

        if self.speaker is None:
            self.output_name = self.name
        else:
            # Escape the speaker text so regex metacharacters in it are
            # matched literally; the raw string avoids the invalid ``\W``
            # string escape.
            speaker_pattern = re.escape('{}'.format(self.speaker)) + r'\W*'
            self.output_name = re.sub(speaker_pattern, '', self.name)

        if attribute is None:
            category = 'tier' if base else 'spelling'
            self.attribute = Attribute(Attribute.sanitize_name(name), category, name, is_default=is_default)
        else:
            self.attribute = attribute

コード例 #5

0

ファイルを表示

def inspect_discourse_ilg(path, number=None):
    """
    Generate a list of AnnotationTypes for a specified text file for parsing
    it as an interlinear gloss text file

    If ``path`` is a directory, every ``.txt`` file below it is read; the
    AnnotationTypes are created from one of those files and the labels of
    the remaining files are added to them.

    Parameters
    ----------
    path : str
        Full path to text file, or to a directory containing text files
    number : int, optional
        Number of lines per gloss, if not supplied, it is auto-detected
        (for a directory, the most frequent value across its files is used)

    Returns
    -------
    list of AnnotationTypes
        Autodetected AnnotationTypes for the text file, one per gloss line
    """
    trans_delimiters = ['.', ';', ',']
    lines = {}
    if os.path.isdir(path):
        for root, subdirs, files in os.walk(path):
            for filename in files:
                if not filename.lower().endswith('.txt'):
                    continue
                p = os.path.join(root, filename)
                lines[p] = text_to_lines(p)
        # Only auto-detect when the caller did not specify the gloss size;
        # previously a supplied ``number`` was silently overwritten.
        if number is None:
            numbers = {k: calculate_lines_per_gloss(v)
                       for k, v in lines.items()}
            number = most_frequent_value(numbers)
    else:
        lines[path] = text_to_lines(path)
        if number is None:
            number = calculate_lines_per_gloss(lines[path])
        p = path

    # ``p`` is the file whose first gloss serves as the template for the
    # AnnotationTypes (the last file walked, in the directory case).
    annotation_types = []
    for i in range(number):
        name = 'Line {}'.format(i + 1)
        labels = lines[p][i][1]
        if i == 0:
            # The first line of a gloss is always treated as the spelling
            # anchor.
            att = Attribute('spelling', 'spelling', 'Spelling')
            a = AnnotationType(name,
                               None,
                               None,
                               anchor=True,
                               token=False,
                               attribute=att)
        else:
            cat = Attribute.guess_type(labels, trans_delimiters)
            att = Attribute(Attribute.sanitize_name(name), cat, name)
            a = AnnotationType(name,
                               None,
                               annotation_types[0].name,
                               token=False,
                               attribute=att)
            if cat == 'tier' and a.trans_delimiter is None:
                # Use the first candidate delimiter that occurs in any label.
                for label in labels:
                    for delim in trans_delimiters:
                        if delim in label:
                            a.trans_delimiter = delim
                            break
                    if a.trans_delimiter is not None:
                        break
        a.add(labels, save=False)
        annotation_types.append(a)

    # Feed the labels of every other file into the same AnnotationTypes.
    for k in lines:
        if k == p:
            continue
        for i in range(number):
            annotation_types[i].add(lines[k][i][1], save=False)

    return annotation_types

コード例 #6

0

ファイルを表示

ファイル: textgrid.py プロジェクト: YurikaAonuki/CorpusTools

def inspect_discourse_textgrid(path):
    """
    Build the AnnotationTypes describing a TextGrid file, or all TextGrid
    files found below a directory

    The interval tiers of the first TextGrid define the AnnotationTypes;
    the marks of every later TextGrid are added to them tier by tier.

    Parameters
    ----------
    path : str
        Full path to a TextGrid file or to a directory of TextGrid files

    Returns
    -------
    list of AnnotationTypes
        Autodetected AnnotationTypes, one per interval tier

    Raises
    ------
    PCTError
        If a later TextGrid does not have the same number of interval
        tiers as the AnnotationTypes already created
    """
    trans_delimiters = ['.', ' ', ';', ',']

    if os.path.isdir(path):
        textgrids = []
        for root, subdirs, files in os.walk(path):
            textgrids.extend(
                os.path.join(root, filename) for filename in files
                if filename.lower().endswith('.textgrid'))
    else:
        textgrids = [path]

    anno_types = []
    for tg_path in textgrids:
        tg = load_textgrid(tg_path)
        spellings, segments, attributes = guess_tiers(tg)
        base = segments[0] if len(segments) != 0 else None
        anchor = spellings[0] if len(spellings) != 0 else None
        interval_tiers = [
            tier for tier in tg.tiers if isinstance(tier, IntervalTier)
        ]

        if len(anno_types) != 0:
            # AnnotationTypes already exist: just append this file's marks.
            if len(anno_types) != len(interval_tiers):
                raise PCTError(
                    "The TextGrids must have the same number of tiers.")
            for anno, tier in zip(anno_types, interval_tiers):
                anno.add((x.mark for x in tier), save=False)
            continue

        # Nothing created yet: derive one AnnotationType per interval tier.
        for tier in interval_tiers:
            tier_name = tier.name
            if tier_name in spellings:
                a = AnnotationType(tier_name,
                                   base,
                                   None,
                                   anchor=True,
                                   token=False)
            elif tier_name in segments:
                a = AnnotationType(tier_name,
                                   None,
                                   anchor,
                                   base=True,
                                   token=True)
            else:
                labels = uniqueLabels(tier)
                cat = Attribute.guess_type(labels, trans_delimiters)
                att = Attribute(Attribute.sanitize_name(tier_name), cat,
                                tier_name)
                a = AnnotationType(tier_name,
                                   None,
                                   anchor,
                                   token=False,
                                   attribute=att)
                if cat == 'tier':
                    # Take the first candidate delimiter found in a label.
                    for label in labels:
                        hit = next(
                            (d for d in trans_delimiters if d in label),
                            None)
                        if hit is not None:
                            a.trans_delimiter = hit
                        if a.trans_delimiter is not None:
                            break
            a.add((x.mark for x in tier), save=False)
            anno_types.append(a)

    return anno_types

コード例 #7

0

ファイルを表示

ファイル: textgrid.py プロジェクト: FieldDB/CorpusTools

def inspect_discourse_textgrid(path):
    """
    Autodetect AnnotationTypes for a TextGrid file or for every TextGrid
    file below a directory

    AnnotationTypes are created from the interval tiers of the first
    TextGrid; the marks of later TextGrids are added to them by position.

    Parameters
    ----------
    path : str
        Full path to a TextGrid file or to a directory of TextGrid files

    Returns
    -------
    list of AnnotationTypes
        Autodetected AnnotationTypes, one per interval tier

    Raises
    ------
    PCTError
        If a later TextGrid does not have the same number of interval
        tiers as the AnnotationTypes already created
    """
    trans_delimiters = ['.', ' ', ';', ',']

    if os.path.isdir(path):
        textgrids = [os.path.join(root, filename)
                     for root, subdirs, files in os.walk(path)
                     for filename in files
                     if filename.lower().endswith('.textgrid')]
    else:
        textgrids = [path]

    anno_types = []
    for textgrid_path in textgrids:
        tg = load_textgrid(textgrid_path)
        spellings, segments, attributes = guess_tiers(tg)
        base = None if len(segments) == 0 else segments[0]
        anchor = None if len(spellings) == 0 else spellings[0]
        interval_tiers = [t for t in tg.tiers if isinstance(t, IntervalTier)]

        if len(anno_types) == 0:
            # First TextGrid with tiers: create one AnnotationType per tier.
            for tier in interval_tiers:
                in_spellings = tier.name in spellings
                if in_spellings:
                    a = AnnotationType(tier.name, base, None,
                                       anchor = True, token = False)
                elif tier.name in segments:
                    a = AnnotationType(tier.name, None, anchor,
                                       base = True, token = True)
                else:
                    labels = uniqueLabels(tier)
                    cat = Attribute.guess_type(labels, trans_delimiters)
                    sanitized = Attribute.sanitize_name(tier.name)
                    att = Attribute(sanitized, cat, tier.name)
                    a = AnnotationType(tier.name, None, anchor,
                                       token = False, attribute = att)
                    if cat == 'tier':
                        # First candidate delimiter seen in a label wins.
                        for label in labels:
                            found = next((d for d in trans_delimiters
                                          if d in label), None)
                            if found is not None:
                                a.trans_delimiter = found
                            if a.trans_delimiter is not None:
                                break
                a.add((x.mark for x in tier), save = False)
                anno_types.append(a)
        else:
            if len(interval_tiers) != len(anno_types):
                raise PCTError("The TextGrids must have the same number of tiers.")
            for anno, tier in zip(anno_types, interval_tiers):
                anno.add((x.mark for x in tier), save = False)

    return anno_types

コード例 #8

0

ファイルを表示

ファイル: text_ilg.py プロジェクト: PhonologicalCorpusTools/CorpusTools

def inspect_discourse_ilg(path, number = None):
    """
    Generate a list of AnnotationTypes for a specified text file for parsing
    it as an interlinear gloss text file

    If ``path`` is a directory, every ``.txt`` file below it is read; the
    AnnotationTypes are created from one of those files and the labels of
    the remaining files are added to them.

    Parameters
    ----------
    path : str
        Full path to text file, or to a directory containing text files
    number : int, optional
        Number of lines per gloss, if not supplied, it is auto-detected
        (for a directory, the most frequent value across its files is used)

    Returns
    -------
    list of AnnotationTypes
        Autodetected AnnotationTypes for the text file, one per gloss line
    """
    trans_delimiters = ['.', ';', ',']
    lines = {}
    if os.path.isdir(path):
        for root, subdirs, files in os.walk(path):
            for filename in files:
                if not filename.lower().endswith('.txt'):
                    continue
                p = os.path.join(root, filename)
                lines[p] = text_to_lines(p)
        # Auto-detect only when no gloss size was supplied; a supplied
        # ``number`` used to be overwritten unconditionally.
        if number is None:
            numbers = {k: calculate_lines_per_gloss(v) for k, v in lines.items()}
            number = most_frequent_value(numbers)
    else:
        lines[path] = text_to_lines(path)
        if number is None:
            number = calculate_lines_per_gloss(lines[path])
        p = path

    # ``p`` names the file whose first gloss is the template for the
    # AnnotationTypes (the last file walked, in the directory case).
    annotation_types = []
    for i in range(number):
        name = 'Line {}'.format(i+1)
        labels = lines[p][i][1]
        if i == 0:
            # The first gloss line is always the spelling anchor.
            att = Attribute('spelling','spelling','Spelling')
            a = AnnotationType(name, None, None, anchor = True, token = False, attribute = att)
        else:
            cat = Attribute.guess_type(labels, trans_delimiters)
            att = Attribute(Attribute.sanitize_name(name), cat, name)
            a = AnnotationType(name, None, annotation_types[0].name, token = False, attribute = att)
            if cat == 'tier' and a.trans_delimiter is None:
                # Use the first candidate delimiter that occurs in any label.
                for label in labels:
                    for delim in trans_delimiters:
                        if delim in label:
                            a.trans_delimiter = delim
                            break
                    if a.trans_delimiter is not None:
                        break
        a.add(labels, save = False)
        annotation_types.append(a)

    # Add the labels of every other file to the same AnnotationTypes.
    for k in lines:
        if k == p:
            continue
        for i in range(number):
            annotation_types[i].add(lines[k][i][1], save = False)

    return annotation_types

コード例 #9

0

ファイルを表示

ファイル: csv.py プロジェクト: FieldDB/CorpusTools

def inspect_csv(path, num_lines = 10, coldelim = None, transdelim = None):
    """
    Generate a list of AnnotationTypes for a specified text file for parsing
    it as a column-delimited file

    Parameters
    ----------
    path : str
        Full path to text file
    num_lines: int, optional
        The number of leading data rows used to guess each column's type
        (the whole file is still read)
    coldelim: str, optional
        A prespecified column delimiter to use, will autodetect if not
        supplied
    transdelim : str, optional
        A prespecified transcription delimiter to look for, will
        autodetect from a default set if not supplied

    Returns
    -------
    tuple
        A list of autodetected AnnotationTypes (one per column) and the
        column delimiter that was used

    Raises
    ------
    DelimiterError
        If no candidate delimiter splits the header into multiple columns
    PCTError
        If a data row does not have as many columns as the header
    """
    if coldelim is not None:
        common_delimiters = [coldelim]
    else:
        common_delimiters = [',','\t',':','|']
    if transdelim is not None:
        trans_delimiters = [transdelim]
    else:
        trans_delimiters = ['.',' ', ';', ',']

    # 'utf-8-sig' reads plain UTF-8 and also drops a leading byte-order
    # mark, which would otherwise end up inside the first column name.
    with open(path,'r', encoding='utf-8-sig') as f:
        head = f.readline().strip()
        # Blank lines (e.g. a trailing one) carry no row and are skipped
        # instead of failing the column-count check below.
        lines = [stripped for stripped in (line.strip() for line in f)
                 if stripped]

    # Pick the candidate delimiter that yields the most header columns.
    best = ''
    num = 1
    for d in common_delimiters:
        trial = len(head.split(d))
        if trial > num:
            num = trial
            best = d
    if best == '':
        raise DelimiterError('The column delimiter specified did not create multiple columns.')

    head = head.split(best)
    vals = {h: list() for h in head}

    for line in lines:
        row = line.split(best)
        if len(row) != len(head):
            raise PCTError('{}, {}'.format(row, head))
        for h, value in zip(head, row):
            vals[h].append(value)

    atts = list()
    for h in head:
        cat = Attribute.guess_type(vals[h][:num_lines], trans_delimiters)
        att = Attribute(Attribute.sanitize_name(h), cat, h)
        a = AnnotationType(h, None, None, token = False, attribute = att)
        # The delimiter is looked up in the first value only; the emptiness
        # check avoids an IndexError for a file with no data rows.
        if cat == 'tier' and vals[h]:
            first_value = vals[h][0]
            for t in trans_delimiters:
                if t in first_value:
                    a.trans_delimiter = t
                    break
        a.add(vals[h], save = False)
        atts.append(a)

    return atts, best

コード例 #10

0

ファイルを表示

def inspect_csv(path, num_lines = 10, coldelim = None, transdelim = None):
    """
    Generate a list of AnnotationTypes for a specified text file for parsing
    it as a column-delimited file

    Parameters
    ----------
    path : str
        Full path to text file
    num_lines: int, optional
        The number of leading data rows used to guess each column's type
        (the whole file is still read)
    coldelim: str, optional
        A prespecified column delimiter to use, will autodetect if not
        supplied
    transdelim : str, optional
        A prespecified transcription delimiter to look for, will
        autodetect from a default set if not supplied

    Returns
    -------
    tuple
        A list of autodetected AnnotationTypes (one per column) and the
        column delimiter that was used

    Raises
    ------
    DelimiterError
        If no candidate delimiter splits the header into multiple columns
    PCTError
        If a data row does not have as many columns as the header
    """
    if coldelim is not None:
        common_delimiters = [coldelim]
    else:
        common_delimiters = [',','\t',':','|']
    if transdelim is not None:
        trans_delimiters = [transdelim]
    else:
        trans_delimiters = ['.',' ', ';', ',']

    # 'utf-8-sig' decodes ordinary UTF-8 and strips a leading byte-order
    # mark so it cannot leak into the first column name.
    with open(path,'r', encoding='utf-8-sig') as f:
        head = f.readline().strip()
        lines = []
        for line in f:
            stripped = line.strip()
            # Skip blank lines (such as a trailing one) rather than letting
            # them fail the column-count check below.
            if stripped:
                lines.append(stripped)

    # The delimiter producing the most header columns wins.
    best = ''
    num = 1
    for d in common_delimiters:
        trial = len(head.split(d))
        if trial > num:
            num = trial
            best = d
    if best == '':
        raise DelimiterError('The column delimiter specified did not create multiple columns.')

    head = head.split(best)
    vals = {h: list() for h in head}

    for line in lines:
        fields = line.split(best)
        if len(fields) != len(head):
            raise PCTError('{}, {}'.format(fields, head))
        for h, field in zip(head, fields):
            vals[h].append(field)

    atts = list()
    for h in head:
        column = vals[h]
        cat = Attribute.guess_type(column[:num_lines], trans_delimiters)
        att = Attribute(Attribute.sanitize_name(h), cat, h)
        a = AnnotationType(h, None, None, token = False, attribute = att)
        # Only the first value is inspected for a delimiter; guard against
        # a column with no data rows to avoid an IndexError.
        if cat == 'tier' and column:
            for t in trans_delimiters:
                if t in column[0]:
                    a.trans_delimiter = t
                    break
        a.add(column, save = False)
        atts.append(a)

    return atts, best