def __init__(self, name, subtype, supertype, attribute=None, anchor=False,
             token=False, base=False, speaker=None):
    """
    Set up an empty annotation type and derive its output name and Attribute

    Parameters
    ----------
    name : str
        Name of the annotation type; stored as ``self.name``
    subtype : object
        Stored unchanged as ``self.subtype``
    supertype : object
        Stored unchanged as ``self.supertype``
    attribute : Attribute, optional
        Attribute to associate with this annotation type; if not supplied,
        one is built from ``name`` with type 'tier' when ``base`` is true
        and type 'spelling' otherwise
    anchor : bool, optional
        Stored as ``self.anchor``
    token : bool, optional
        Stored as ``self.token``
    base : bool, optional
        Stored as ``self.base``; also selects the type of the default
        Attribute (see ``attribute``)
    speaker : str, optional
        Speaker label; when supplied, every occurrence of it (together
        with any non-word characters that directly follow it) is removed
        from ``name`` to form ``self.output_name``
    """
    self.characters = set()
    self.ignored_characters = set()
    self.digraphs = set()
    self.trans_delimiter = None
    self.morph_delimiters = set()
    self.number_behavior = None
    self._list = []
    self.name = name
    self.subtype = subtype
    self.supertype = supertype
    self.token = token
    self.base = base
    self.anchor = anchor
    self.speaker = speaker
    self.ignored = False

    if self.speaker is not None:
        # Escape the speaker label so that regex metacharacters in it
        # (e.g. "+", "(", ".") are matched literally; the raw string keeps
        # "\W" from being treated as an invalid string escape.
        speaker_pattern = re.escape(str(self.speaker)) + r'\W*'
        self.output_name = re.sub(speaker_pattern, '', self.name)
    else:
        self.output_name = self.name

    if attribute is None:
        att_type = 'tier' if base else 'spelling'
        self.attribute = Attribute(Attribute.sanitize_name(name), att_type, name)
    else:
        self.attribute = attribute
def __init__(self, name, subtype, supertype, attribute=None, anchor=False,
             token=False, base=False, speaker=None):
    """
    Initialise the containers and flags of an annotation type

    Parameters
    ----------
    name : str
        Name of the annotation type; stored as ``self.name``
    subtype : object
        Stored unchanged as ``self.subtype``
    supertype : object
        Stored unchanged as ``self.supertype``
    attribute : Attribute, optional
        Attribute to use; when omitted, an Attribute is created from
        ``name`` whose type is 'tier' if ``base`` is true, else 'spelling'
    anchor : bool, optional
        Stored as ``self.anchor``
    token : bool, optional
        Stored as ``self.token``
    base : bool, optional
        Stored as ``self.base``; also decides the type of the default
        Attribute
    speaker : str, optional
        Speaker label; when supplied, ``self.output_name`` is ``name``
        with every occurrence of the label (and any non-word characters
        directly after it) removed, otherwise it equals ``name``
    """
    # Per-type parsing settings, all empty/unset until configured.
    self.characters = set()
    self.ignored_characters = set()
    self.digraphs = set()
    self.trans_delimiter = None
    self.morph_delimiters = set()
    self.number_behavior = None
    self._list = []

    self.name = name
    self.subtype = subtype
    self.supertype = supertype
    self.token = token
    self.base = base
    self.anchor = anchor
    self.speaker = speaker
    self.ignored = False

    if self.speaker is None:
        self.output_name = self.name
    else:
        # The label is user data, not a pattern: escape it so characters
        # such as "(" or "." cannot change (or break) the regex. A raw
        # string is used because "\W" is not a valid string escape.
        literal_speaker = re.escape(str(self.speaker))
        self.output_name = re.sub(literal_speaker + r'\W*', '', self.name)

    if attribute is not None:
        self.attribute = attribute
    elif base:
        self.attribute = Attribute(Attribute.sanitize_name(name), 'tier', name)
    else:
        self.attribute = Attribute(Attribute.sanitize_name(name), 'spelling', name)
def __init__(
        self,
        name,
        subtype,
        supertype,
        attribute=None,
        anchor=False,
        token=False,
        base=False,
        speaker=None,
        is_default=False,
):
    """
    Set up an empty annotation type and derive its output name and Attribute

    Parameters
    ----------
    name : str
        Name of the annotation type; stored as ``self.name``
    subtype : object
        Stored unchanged as ``self.subtype``
    supertype : object
        Stored unchanged as ``self.supertype``
    attribute : Attribute, optional
        Attribute to associate with this annotation type; if not supplied,
        one is built from ``name`` with type 'tier' when ``base`` is true
        and type 'spelling' otherwise, and ``is_default`` is forwarded to it
    anchor : bool, optional
        Stored as ``self.anchor``
    token : bool, optional
        Stored as ``self.token``
    base : bool, optional
        Stored as ``self.base``; also selects the type of the default
        Attribute
    speaker : str, optional
        Speaker label; when supplied, every occurrence of it (together
        with any non-word characters that directly follow it) is removed
        from ``name`` to form ``self.output_name``
    is_default : bool, optional
        Stored as ``self.is_default`` and passed on to the automatically
        created Attribute
    """
    self.characters = set()
    self.ignored_characters = set()
    self.digraphs = set()
    self.trans_delimiter = None
    self.morph_delimiters = set()
    self.number_behavior = None
    # This list contains Annotations for spelling and BaseAnnotations
    # for transcriptions
    self._list = []
    # This variable name is confusing - it represents something like
    # "Orthography" or "Transcription", rather than the name that the user
    # would have given to the column, e.g. "canonical_pron" or "Spelling".
    # To get the user's preferred name, look at self.output_name or
    # self.attribute.
    self.name = name
    self.subtype = subtype
    self.supertype = supertype
    self.token = token
    self.base = base  # base is transcription/tier type
    self.anchor = anchor  # anchor is spelling type
    self.speaker = speaker
    self.ignored = False
    self.is_default = is_default

    if self.speaker is not None:
        # Escape the speaker label so that regex metacharacters in it
        # (e.g. "+", "(", ".") are matched literally; the raw string keeps
        # "\W" from being treated as an invalid string escape.
        speaker_pattern = re.escape(str(self.speaker)) + r'\W*'
        self.output_name = re.sub(speaker_pattern, '', self.name)
    else:
        self.output_name = self.name

    if attribute is None:
        att_type = 'tier' if base else 'spelling'
        self.attribute = Attribute(Attribute.sanitize_name(name), att_type,
                                   name, is_default=is_default)
    else:
        self.attribute = attribute
def __init__(self, name, subtype, supertype, attribute=None, anchor=False,
             token=False, base=False, speaker=None, is_default=False):
    """
    Initialise the containers, delimiters and flags of an annotation type

    Parameters
    ----------
    name : str
        Name of the annotation type; stored as ``self.name``
    subtype : object
        Stored unchanged as ``self.subtype``
    supertype : object
        Stored unchanged as ``self.supertype``
    attribute : Attribute, optional
        Attribute to use; when omitted, an Attribute is created from
        ``name`` whose type is 'tier' if ``base`` is true, else
        'spelling', and ``is_default`` is forwarded to it
    anchor : bool, optional
        Stored as ``self.anchor``
    token : bool, optional
        Stored as ``self.token``
    base : bool, optional
        Stored as ``self.base``; also decides the type of the default
        Attribute
    speaker : str, optional
        Speaker label; when supplied, ``self.output_name`` is ``name``
        with every occurrence of the label (and any non-word characters
        directly after it) removed, otherwise it equals ``name``
    is_default : bool, optional
        Stored as ``self.is_default`` and passed on to the automatically
        created Attribute
    """
    self.characters = set()
    self.ignored_characters = set()
    self.digraphs = set()
    self.trans_delimiter = None
    self.syllable_delimiter = None
    self.morph_delimiters = set()
    self.number_behavior = None
    self.stress_specification = dict()
    self.tone_specification = dict()
    # This list contains Annotations for spelling and BaseAnnotations
    # for transcriptions
    self._list = []
    # This variable name is confusing - it represents something like
    # "Orthography" or "Transcription", rather than the name that the user
    # would have given to the column, e.g. "canonical_pron" or "Spelling".
    # To get the user's preferred name, look at self.output_name or
    # self.attribute.
    self.name = name
    self.subtype = subtype
    self.supertype = supertype
    self.token = token
    self.base = base  # base is transcription/tier type
    self.anchor = anchor  # anchor is spelling type
    self.speaker = speaker
    self.ignored = False
    self.is_default = is_default

    if self.speaker is None:
        self.output_name = self.name
    else:
        # The label is user data, not a pattern: escape it so characters
        # such as "|" or "." cannot change (or break) the regex. A raw
        # string is used because "\W" is not a valid string escape.
        literal_speaker = re.escape(str(self.speaker))
        self.output_name = re.sub(literal_speaker + r'\W*', '', self.name)

    if attribute is not None:
        self.attribute = attribute
    elif base:
        self.attribute = Attribute(Attribute.sanitize_name(name), 'tier',
                                   name, is_default=is_default)
    else:
        self.attribute = Attribute(Attribute.sanitize_name(name), 'spelling',
                                   name, is_default=is_default)
def inspect_discourse_ilg(path, number=None):
    """
    Generate a list of AnnotationTypes for a specified text file for parsing
    it as an interlinear gloss text file

    Parameters
    ----------
    path : str
        Full path to text file, or to a directory that is searched
        recursively for files ending in ``.txt`` (case-insensitive)
    number : int, optional
        Number of lines per gloss, if not supplied, it is auto-detected
        (for a directory, the most frequent value across its files is used)

    Returns
    -------
    list of AnnotationTypes
        Autodetected AnnotationTypes for the text file; an empty list if
        ``path`` is a directory that contains no ``.txt`` files
    """
    trans_delimiters = ['.', ';', ',']
    # Only auto-detect when the caller did not supply a value; a supplied
    # ``number`` must not be overwritten.
    autodetect = number is None
    lines = {}
    if os.path.isdir(path):
        numbers = {}
        for root, subdirs, files in os.walk(path):
            for filename in files:
                if not filename.lower().endswith('.txt'):
                    continue
                p = os.path.join(root, filename)
                lines[p] = text_to_lines(p)
                if autodetect:
                    numbers[p] = calculate_lines_per_gloss(lines[p])
        if not lines:
            # Nothing to inspect; without this guard ``p`` would be unbound.
            return []
        if autodetect:
            number = most_frequent_value(numbers)
    else:
        lines[path] = text_to_lines(path)
        if autodetect:
            number = calculate_lines_per_gloss(lines[path])
        p = path

    # ``p`` is now the single file, or the last .txt file visited in the
    # directory walk; its first ``number`` lines define the AnnotationTypes.
    annotation_types = []
    for i in range(number):
        name = 'Line {}'.format(i + 1)
        # Second element of each line entry holds the labels that are added
        # to the annotation type (presumably the tokens of that line --
        # confirm against text_to_lines).
        labels = lines[p][i][1]
        if i == 0:
            # The first line of a gloss is always treated as the spelling
            # anchor.
            att = Attribute('spelling', 'spelling', 'Spelling')
            a = AnnotationType(name, None, None, anchor=True, token=False,
                               attribute=att)
        else:
            cat = Attribute.guess_type(labels, trans_delimiters)
            att = Attribute(Attribute.sanitize_name(name), cat, name)
            a = AnnotationType(name, None, annotation_types[0].name,
                               token=False, attribute=att)
            if cat == 'tier' and a.trans_delimiter is None:
                # Use the first delimiter (in priority order) found in the
                # first label that contains any of them.
                found = next((delim for label in labels
                              for delim in trans_delimiters
                              if delim in label), None)
                if found is not None:
                    a.trans_delimiter = found
        a.add(labels, save=False)
        annotation_types.append(a)

    # Feed the corresponding lines of every other file into the same types.
    for k in lines:
        if k == p:
            continue
        for i in range(number):
            annotation_types[i].add(lines[k][i][1], save=False)

    return annotation_types
def inspect_discourse_textgrid(path):
    """
    Build the list of AnnotationTypes describing the tiers of a TextGrid

    Parameters
    ----------
    path : str
        Full path to a TextGrid file, or to a directory that is walked
        recursively for files ending in ``.textgrid`` (case-insensitive)

    Returns
    -------
    list of AnnotationTypes
        One autodetected AnnotationType per interval tier; the types are
        created from the first TextGrid that yields any, and the labels of
        every further TextGrid are added to them by tier position

    Raises
    ------
    PCTError
        If a later TextGrid does not have the same number of interval
        tiers as there are AnnotationTypes
    """
    trans_delimiters = ['.', ' ', ';', ',']

    if os.path.isdir(path):
        textgrids = [os.path.join(root, filename)
                     for root, subdirs, files in os.walk(path)
                     for filename in files
                     if filename.lower().endswith('.textgrid')]
    else:
        textgrids = [path]

    anno_types = []
    for tg_path in textgrids:
        tg = load_textgrid(tg_path)
        spellings, segments, attributes = guess_tiers(tg)
        base = None if len(segments) == 0 else segments[0]
        anchor = None if len(spellings) == 0 else spellings[0]
        interval_tiers = [tier for tier in tg.tiers
                          if isinstance(tier, IntervalTier)]

        if len(anno_types) != 0:
            # Types already exist: just append this file's marks, tier by
            # tier, after checking the tier counts line up.
            if len(anno_types) != len(interval_tiers):
                raise PCTError(
                    "The TextGrids must have the same number of tiers.")
            for existing, tier in zip(anno_types, interval_tiers):
                existing.add((x.mark for x in tier), save=False)
            continue

        for tier in interval_tiers:
            if tier.name in spellings:
                a = AnnotationType(tier.name, base, None, anchor=True,
                                   token=False)
            elif tier.name in segments:
                a = AnnotationType(tier.name, None, anchor, base=True,
                                   token=True)
            else:
                labels = uniqueLabels(tier)
                cat = Attribute.guess_type(labels, trans_delimiters)
                att = Attribute(Attribute.sanitize_name(tier.name), cat,
                                tier.name)
                a = AnnotationType(tier.name, None, anchor, token=False,
                                   attribute=att)
                if cat == 'tier':
                    # First delimiter (in priority order) found in the
                    # first label that contains any of them.
                    found = next((delim for label in labels
                                  for delim in trans_delimiters
                                  if delim in label), None)
                    if found is not None:
                        a.trans_delimiter = found
            a.add((x.mark for x in tier), save=False)
            anno_types.append(a)

    return anno_types
def inspect_discourse_textgrid(path):
    """
    Autodetect AnnotationTypes for a TextGrid file or a folder of them

    Parameters
    ----------
    path : str
        Full path to a TextGrid file; if it is a directory, every file
        below it whose name ends in ``.textgrid`` (case-insensitive) is
        inspected

    Returns
    -------
    list of AnnotationTypes
        Autodetected AnnotationTypes, one per interval tier, filled with
        the marks of all inspected TextGrids

    Raises
    ------
    PCTError
        If a TextGrid inspected after the AnnotationTypes were created has
        a different number of interval tiers
    """
    trans_delimiters = ['.', ' ', ';', ',']

    textgrids = []
    if not os.path.isdir(path):
        textgrids.append(path)
    else:
        for root, subdirs, files in os.walk(path):
            textgrids.extend(os.path.join(root, filename)
                             for filename in files
                             if filename.lower().endswith('.textgrid'))

    anno_types = []
    for grid_path in textgrids:
        tg = load_textgrid(grid_path)
        spellings, segments, attributes = guess_tiers(tg)
        base = segments[0] if len(segments) != 0 else None
        anchor = spellings[0] if len(spellings) != 0 else None
        interval_tiers = [t for t in tg.tiers if isinstance(t, IntervalTier)]

        if len(anno_types) == 0:
            # No types yet: create one per interval tier of this TextGrid.
            for tier in interval_tiers:
                tier_name = tier.name
                if tier_name in spellings:
                    a = AnnotationType(tier_name, base, None,
                                       anchor=True, token=False)
                elif tier_name in segments:
                    a = AnnotationType(tier_name, None, anchor,
                                       base=True, token=True)
                else:
                    labels = uniqueLabels(tier)
                    cat = Attribute.guess_type(labels, trans_delimiters)
                    att = Attribute(Attribute.sanitize_name(tier_name), cat,
                                    tier_name)
                    a = AnnotationType(tier_name, None, anchor,
                                       token=False, attribute=att)
                    if cat == 'tier':
                        # Take the first delimiter, in priority order, of
                        # the first label containing one.
                        hits = (delim for label in labels
                                for delim in trans_delimiters
                                if delim in label)
                        first_hit = next(hits, None)
                        if first_hit is not None:
                            a.trans_delimiter = first_hit
                a.add((x.mark for x in tier), save=False)
                anno_types.append(a)
        else:
            if len(interval_tiers) != len(anno_types):
                raise PCTError(
                    "The TextGrids must have the same number of tiers.")
            # Same tier layout assumed: add marks by tier position.
            for position, tier in enumerate(interval_tiers):
                anno_types[position].add((x.mark for x in tier), save=False)

    return anno_types
def inspect_discourse_ilg(path, number=None):
    """
    Generate a list of AnnotationTypes for a specified text file for parsing
    it as an interlinear gloss text file

    Parameters
    ----------
    path : str
        Full path to text file, or to a directory that is searched
        recursively for files ending in ``.txt`` (case-insensitive)
    number : int, optional
        Number of lines per gloss, if not supplied, it is auto-detected
        (for a directory, the most frequent value across its files is used)

    Returns
    -------
    list of AnnotationTypes
        Autodetected AnnotationTypes for the text file; an empty list if
        ``path`` is a directory that contains no ``.txt`` files
    """
    trans_delimiters = ['.', ';', ',']
    # A caller-supplied ``number`` takes precedence over auto-detection.
    needs_detection = number is None
    lines = {}
    if os.path.isdir(path):
        numbers = {}
        for root, subdirs, files in os.walk(path):
            for filename in files:
                if not filename.lower().endswith('.txt'):
                    continue
                p = os.path.join(root, filename)
                lines[p] = text_to_lines(p)
                if needs_detection:
                    numbers[p] = calculate_lines_per_gloss(lines[p])
        if not lines:
            # No .txt file was found, so there is no reference file ``p``
            # to build AnnotationTypes from.
            return []
        if needs_detection:
            number = most_frequent_value(numbers)
    else:
        lines[path] = text_to_lines(path)
        if needs_detection:
            number = calculate_lines_per_gloss(lines[path])
        p = path

    # ``p`` is the reference file: the single file, or the last .txt file
    # visited while walking the directory.
    annotation_types = []
    for i in range(number):
        name = 'Line {}'.format(i + 1)
        # Index 1 of each line entry holds the labels added to the type
        # (presumably that line's tokens -- confirm against text_to_lines).
        labels = lines[p][i][1]
        if i == 0:
            # Line 1 of every gloss is the spelling anchor.
            att = Attribute('spelling', 'spelling', 'Spelling')
            a = AnnotationType(name, None, None, anchor=True, token=False,
                               attribute=att)
        else:
            cat = Attribute.guess_type(labels, trans_delimiters)
            att = Attribute(Attribute.sanitize_name(name), cat, name)
            a = AnnotationType(name, None, annotation_types[0].name,
                               token=False, attribute=att)
            if cat == 'tier' and a.trans_delimiter is None:
                # First delimiter (in priority order) found in the first
                # label that contains any of them.
                found = next((delim for label in labels
                              for delim in trans_delimiters
                              if delim in label), None)
                if found is not None:
                    a.trans_delimiter = found
        a.add(labels, save=False)
        annotation_types.append(a)

    # Add the matching lines of all remaining files to the same types.
    for k in lines:
        if k == p:
            continue
        for i in range(number):
            annotation_types[i].add(lines[k][i][1], save=False)

    return annotation_types
def inspect_csv(path, num_lines=10, coldelim=None, transdelim=None):
    """
    Generate a list of AnnotationTypes for a specified text file for parsing
    it as a column-delimited file

    Parameters
    ----------
    path : str
        Full path to text file
    num_lines: int, optional
        The number of data lines (from the top of the file) used when
        guessing the type of each column; the whole file is still read
        and added to the AnnotationTypes
    coldelim: str, optional
        A prespecified column delimiter to use, will autodetect if not
        supplied
    transdelim : str, optional
        A prespecified transcription delimiter to look for; if not
        supplied, a built-in set of candidates is tried

    Returns
    -------
    tuple of (list of AnnotationTypes, str)
        Autodetected AnnotationTypes for the text file (one per header
        column) and the column delimiter that was used

    Raises
    ------
    DelimiterError
        If no candidate column delimiter splits the header line into more
        than one column
    PCTError
        If a data line does not have the same number of columns as the
        header
    """
    if coldelim is not None:
        common_delimiters = [coldelim]
    else:
        common_delimiters = [',', '\t', ':', '|']
    if transdelim is not None:
        trans_delimiters = [transdelim]
    else:
        trans_delimiters = ['.', ' ', ';', ',']

    with open(path, 'r', encoding='utf-8') as f:
        head = f.readline().strip()
        # Blank lines (e.g. a trailing empty line) carry no data and would
        # otherwise fail the column-count check below.
        lines = [stripped for stripped in (line.strip() for line in f)
                 if stripped]

    # Pick the candidate delimiter that splits the header into the most
    # columns; ties keep the earlier candidate.
    best = ''
    num = 1
    for d in common_delimiters:
        trial = len(head.split(d))
        if trial > num:
            num = trial
            best = d
    if best == '':
        raise DelimiterError(
            'The column delimiter specified did not create multiple columns.')

    head = head.split(best)
    vals = {h: [] for h in head}
    for line in lines:
        row = line.split(best)
        if len(row) != len(head):
            raise PCTError('{}, {}'.format(row, head))
        for h, value in zip(head, row):
            vals[h].append(value)

    atts = []
    for h in head:
        column = vals[h]
        cat = Attribute.guess_type(column[:num_lines], trans_delimiters)
        att = Attribute(Attribute.sanitize_name(h), cat, h)
        a = AnnotationType(h, None, None, token=False, attribute=att)
        if cat == 'tier':
            # Use the first delimiter (in priority order) of the first
            # value that contains one. Scanning the column instead of
            # indexing its first value also keeps a header-only file from
            # raising IndexError.
            found = next((delim for value in column
                          for delim in trans_delimiters
                          if delim in value), None)
            if found is not None:
                a.trans_delimiter = found
        a.add(column, save=False)
        atts.append(a)

    return atts, best