def __init__(self, name, subtype, supertype, attribute=None, anchor=False, token=False, base=False, speaker=None):
    """Initialize an annotation type.

    Parameters
    ----------
    name : str
        Internal name of the annotation type (e.g. "Orthography", "Transcription")
    subtype : str or None
        Name of the annotation type one level down in the hierarchy
    supertype : str or None
        Name of the annotation type one level up in the hierarchy
    attribute : Attribute, optional
        Column attribute to use; autogenerated from ``name`` when omitted
    anchor : bool, optional
        Whether this is the spelling (anchor) type
    token : bool, optional
        Whether annotations are token-specific
    base : bool, optional
        Whether this is a transcription/tier type
    speaker : str, optional
        Speaker prefix; stripped from ``name`` to produce ``output_name``
    """
    self.characters = set()
    self.ignored_characters = set()
    self.digraphs = set()
    self.trans_delimiter = None
    self.morph_delimiters = set()
    self.number_behavior = None
    self._list = []
    self.name = name
    self.subtype = subtype
    self.supertype = supertype
    self.token = token
    self.base = base
    self.anchor = anchor
    self.speaker = speaker
    self.ignored = False
    if self.speaker is not None:
        # FIX: use a raw string -- '\W' in a non-raw literal is an invalid
        # escape (SyntaxWarning on Python 3.12+); the regex is unchanged.
        self.output_name = re.sub(r"{}\W*".format(self.speaker), "", self.name)
    else:
        self.output_name = self.name
    if attribute is None:
        # Autogenerate a column attribute matching this type's role
        if base:
            self.attribute = Attribute(Attribute.sanitize_name(name), "tier", name)
        else:
            self.attribute = Attribute(Attribute.sanitize_name(name), "spelling", name)
    else:
        self.attribute = attribute
def __init__(self, name, subtype, supertype, attribute = None, anchor = False, token = False, base = False, speaker = None):
    """Initialize an annotation type.

    Parameters
    ----------
    name : str
        Internal name of the annotation type (e.g. "Orthography", "Transcription")
    subtype : str or None
        Name of the annotation type one level down in the hierarchy
    supertype : str or None
        Name of the annotation type one level up in the hierarchy
    attribute : Attribute, optional
        Column attribute to use; autogenerated from ``name`` when omitted
    anchor : bool, optional
        Whether this is the spelling (anchor) type
    token : bool, optional
        Whether annotations are token-specific
    base : bool, optional
        Whether this is a transcription/tier type
    speaker : str, optional
        Speaker prefix; stripped from ``name`` to produce ``output_name``
    """
    self.characters = set()
    self.ignored_characters = set()
    self.digraphs = set()
    self.trans_delimiter = None
    self.morph_delimiters = set()
    self.number_behavior = None
    self._list = []
    self.name = name
    self.subtype = subtype
    self.supertype = supertype
    self.token = token
    self.base = base
    self.anchor = anchor
    self.speaker = speaker
    self.ignored = False
    if self.speaker is not None:
        # FIX: raw string -- '\W' in a non-raw literal is an invalid escape
        # (SyntaxWarning on Python 3.12+); the regex itself is unchanged.
        self.output_name = re.sub(r'{}\W*'.format(self.speaker), '', self.name)
    else:
        self.output_name = self.name
    if attribute is None:
        # Autogenerate a column attribute matching this type's role
        if base:
            self.attribute = Attribute(Attribute.sanitize_name(name), 'tier', name)
        else:
            self.attribute = Attribute(Attribute.sanitize_name(name), 'spelling', name)
    else:
        self.attribute = attribute
def test_ilg_basic(ilg_test_dir):
    """Load the basic interlinear-gloss fixture and check the type frequency of 'a'."""
    fixture_path = os.path.join(ilg_test_dir, 'test_basic.txt')
    transcription_attribute = Attribute('transcription', 'tier')
    transcription_attribute.delimiter = '.'
    annotation_types = [
        AnnotationType('spelling', 'transcription', None, token=False, anchor=True),
        AnnotationType('transcription', None, None, token=False, base=True,
                       attribute=transcription_attribute),
    ]
    annotation_types[1].trans_delimiter = '.'
    corpus = load_discourse_ilg('test', fixture_path, annotation_types)
    print(corpus.words)
    print(corpus.lexicon.words)
    assert corpus.lexicon.find('a').frequency == 2
def __init__(
        self,
        name,
        subtype,
        supertype,
        attribute=None,
        anchor=False,
        token=False,
        base=False,
        speaker=None,
        is_default=False,
):
    """Initialize an annotation type.

    Parameters
    ----------
    name : str
        Internal name of the annotation type (e.g. "Orthography", "Transcription")
    subtype : str or None
        Name of the annotation type one level down in the hierarchy
    supertype : str or None
        Name of the annotation type one level up in the hierarchy
    attribute : Attribute, optional
        Column attribute to use; autogenerated from ``name`` when omitted
    anchor : bool, optional
        Whether this is the spelling (anchor) type
    token : bool, optional
        Whether annotations are token-specific
    base : bool, optional
        Whether this is a transcription/tier type
    speaker : str, optional
        Speaker prefix; stripped from ``name`` to produce ``output_name``
    is_default : bool, optional
        Whether this is the default type of its kind; forwarded to the
        autogenerated Attribute
    """
    self.characters = set()
    self.ignored_characters = set()
    self.digraphs = set()
    self.trans_delimiter = None
    self.morph_delimiters = set()
    self.number_behavior = None
    # This list contains Annotations for spelling and BaseAnnotations for transcriptions
    self._list = []
    # "name" is confusing - it represents something like "Orthography" or
    # "Transcription", rather than the name the user gave to the column,
    # e.g. "canonical_pron" or "Spelling"; for the user's preferred name,
    # look at self.output_name or self.attribute.
    self.name = name
    self.subtype = subtype
    self.supertype = supertype
    self.token = token
    self.base = base  # base is transcription/tier type
    self.anchor = anchor  # anchor is spelling type
    self.speaker = speaker
    self.ignored = False
    self.is_default = is_default
    if self.speaker is not None:
        # FIX: raw string -- '\W' in a non-raw literal is an invalid escape
        # (SyntaxWarning on Python 3.12+); the regex itself is unchanged.
        self.output_name = re.sub(r'{}\W*'.format(self.speaker), '', self.name)
    else:
        self.output_name = self.name
    if attribute is None:
        # Autogenerate a column attribute matching this type's role
        if base:
            self.attribute = Attribute(Attribute.sanitize_name(name), 'tier', name,
                                       is_default=is_default)
        else:
            self.attribute = Attribute(Attribute.sanitize_name(name), 'spelling', name,
                                       is_default=is_default)
    else:
        self.attribute = attribute
def test_filter_model(qtbot):
    """FilterModel renders numeric and factor filters and supports row removal."""
    model = FilterModel()
    numeric_attribute = Attribute('test', 'numeric', 'Test')
    model.addRow((numeric_attribute, '__eq__', 0))
    assert model.data(model.index(0, 0), Qt.DisplayRole) == 'Test == 0'
    model.removeRow(0)
    assert len(model.filters) == 0
    factor_attribute = Attribute('test', 'factor', 'Test')
    model.addRow((factor_attribute, ['a', 'b', 'c']))
    assert model.data(model.index(0, 0), Qt.DisplayRole) == 'Test a, b, c'
def test_ilg_data(ilg_test_dir):
    """ilg_to_data yields the expected word Annotations and segment BaseAnnotations."""
    fixture_path = os.path.join(ilg_test_dir, 'test_basic.txt')
    tier_attribute = Attribute('transcription', 'tier')
    tier_attribute.delimiter = '.'
    annotation_types = [
        AnnotationType('spelling', 'transcription', None, token=False, anchor=True),
        AnnotationType('transcription', None, None, token=False, base=True,
                       attribute=tier_attribute),
    ]
    annotation_types[1].trans_delimiter = '.'
    data = ilg_to_data(fixture_path, annotation_types)
    # Build the expected word annotations: label plus (begin, end) span on
    # the transcription tier.
    expected_words = []
    for label, begin, end in (('a', 0, 2), ('a', 2, 4), ('b', 4, 6)):
        annotation = Annotation(label)
        annotation.references.append('transcription')
        annotation.begins.append(begin)
        annotation.ends.append(end)
        expected_words.append(annotation)
    assert data['spelling']._list == expected_words
    assert data['transcription']._list == [BaseAnnotation(segment) for segment in 'ababcd']
def accept(self):
    """Validate the create-tier dialog and accept it if the input is usable.

    Side effects: sets ``self.attribute`` (the new tier's Attribute) and
    ``self.segList`` (the generated segment class) before closing.
    """
    tierName = self.nameEdit.text()
    # NOTE(review): the Attribute is built before the empty-name check below,
    # so an empty tierName still produces an Attribute with an empty name.
    self.attribute = Attribute(tierName.lower().replace(' ', ''), 'tier', tierName)
    if tierName == '':
        reply = QMessageBox.critical(self, "Missing information",
                                     "Please enter a name for the tier.")
        return
    elif self.attribute.name in self.corpus.basic_attributes:
        # Reject names that collide with protected corpus columns.
        reply = QMessageBox.critical(
            self, "Invalid information",
            "The name '{}' overlaps with a protected column.".format(
                tierName))
        return
    elif self.attribute in self.corpus.attributes:
        # Duplicate tier: let the user confirm an overwrite or back out.
        msgBox = QMessageBox(
            QMessageBox.Warning, "Duplicate tiers",
            "'{}' is already the name of a tier. Overwrite?".format(
                tierName), QMessageBox.NoButton, self)
        msgBox.addButton("Overwrite", QMessageBox.AcceptRole)
        msgBox.addButton("Cancel", QMessageBox.RejectRole)
        if msgBox.exec_() != QMessageBox.AcceptRole:
            return
    #createType = self.createType.currentText()
    #createList = self.createWidget.value()
    inClass, notInClass = self.generateClass()
    if not inClass:
        reply = QMessageBox.critical(
            self, "Missing information",
            "Please specify at least one segment or one feature value")
        # NOTE(review): no ``return`` here in the original, so the dialog is
        # still accepted with an empty class -- confirm whether intended.
    self.segList = inClass
    QDialog.accept(self)
def accept(self):
    """Validate the dialog's name/tier/segment selections and accept it.

    Side effects: sets ``self.attribute`` (a numeric column Attribute),
    ``self.sequenceType``, and ``self.segList`` before closing.
    """
    name = self.nameWidget.text()
    # NOTE(review): the Attribute is built before the empty-name check below,
    # so an empty name still produces an Attribute with an empty name.
    self.attribute = Attribute(name.lower().replace(' ', ''), 'numeric', name)
    if name == '':
        reply = QMessageBox.critical(self, "Missing information",
                                     "Please enter a name for the tier.")
        return
    elif self.attribute.name in self.corpus.basic_attributes:
        # Reject names that collide with protected corpus columns.
        reply = QMessageBox.critical(
            self, "Invalid information",
            "The name '{}' overlaps with a protected column.".format(name))
        return
    elif self.attribute in self.corpus.attributes:
        # Duplicate tier: let the user confirm an overwrite or back out.
        msgBox = QMessageBox(
            QMessageBox.Warning, "Duplicate tiers",
            "'{}' is already the name of a tier. Overwrite?".format(name),
            QMessageBox.NoButton, self)
        msgBox.addButton("Overwrite", QMessageBox.AcceptRole)
        msgBox.addButton("Cancel", QMessageBox.RejectRole)
        if msgBox.exec_() != QMessageBox.AcceptRole:
            return
    self.sequenceType = self.tierWidget.value()
    self.segList = self.segmentSelect.value()
    QDialog.accept(self)
def accept(self):
    """Validate the add-column dialog (name, type, default value) and accept it.

    Side effects: sets ``self.attribute`` with ``default_value`` filled in
    (coerced to float for numeric columns) before closing.
    """
    name = self.nameWidget.text()
    at = self.typeWidget.currentText().lower()  # attribute type, e.g. 'numeric'
    dv = self.defaultWidget.text()  # default value as entered (string)
    self.attribute = Attribute(name.lower().replace(' ', ''), at, name)
    if name == '':
        reply = QMessageBox.critical(self, "Missing information",
                                     "Please enter a name for the tier.")
        return
    elif self.attribute.name in self.corpus.basic_attributes:
        # Reject names that collide with protected corpus columns.
        reply = QMessageBox.critical(
            self, "Invalid information",
            "The name '{}' overlaps with a protected column.".format(name))
        return
    elif self.attribute in self.corpus.attributes:
        # Duplicate tier: let the user confirm an overwrite or back out.
        msgBox = QMessageBox(
            QMessageBox.Warning, "Duplicate tiers",
            "'{}' is already the name of a tier. Overwrite?".format(name),
            QMessageBox.NoButton, self)
        msgBox.addButton("Overwrite", QMessageBox.AcceptRole)
        msgBox.addButton("Cancel", QMessageBox.RejectRole)
        if msgBox.exec_() != QMessageBox.AcceptRole:
            return
    if at == 'numeric':
        # Numeric columns must have a numeric default.
        try:
            dv = float(dv)
        except ValueError:
            reply = QMessageBox.critical(
                self, "Invalid information",
                "The default value for numeric columns must be a number")
            return
    self.attribute.default_value = dv
    QDialog.accept(self)
def accept(self):
    """Validate the CV-skeleton tier dialog and accept it.

    Side effects: when the CV radio button is selected, sets
    ``self.attribute`` and ``self.segList`` before closing.
    """
    if self.cvradio.isChecked():
        tierName = 'CV skeleton'
        self.attribute = Attribute('cvskeleton', 'factor', 'CV skeleton')
        self.segList = self.generateSegList()
    # NOTE(review): if the radio button is NOT checked, ``tierName`` is never
    # bound and the check below raises NameError -- an else branch may have
    # been lost from this copy; confirm against the full dialog source.
    if tierName == '':
        reply = QMessageBox.critical(self, "Missing information",
                                     "Please enter a name for the tier.")
        return
    if self.attribute.name in self.corpus.basic_attributes:
        # Reject names that collide with protected corpus columns.
        reply = QMessageBox.critical(
            self, "Invalid information",
            "The name '{}' overlaps with a protected column.".format(
                tierName))
        return
    elif self.attribute in self.corpus.attributes:
        # Duplicate tier: let the user confirm an overwrite or back out.
        msgBox = QMessageBox(
            QMessageBox.Warning, "Duplicate tiers",
            "'{}' is already the name of a tier. Overwrite?".format(
                tierName), QMessageBox.NoButton, self)
        msgBox.addButton("Overwrite", QMessageBox.AcceptRole)
        msgBox.addButton("Cancel", QMessageBox.RejectRole)
        if msgBox.exec_() != QMessageBox.AcceptRole:
            return
    QDialog.accept(self)
def test_corpus_model(qtbot, specified_test_corpus, settings):
    """Exercise CorpusModel end to end: headers, column add/remove, derived
    tiers/columns, word replacement, and hiding non-lexical items.

    Kept as one ordered scenario because each step mutates model state that
    the next step relies on.
    """
    model = CorpusModel(specified_test_corpus, settings)
    # Default columns of a fresh model.
    assert(model.headerData(0, Qt.Horizontal, Qt.DisplayRole) == 'Spelling')
    assert(model.headerData(1, Qt.Horizontal, Qt.DisplayRole) == 'Transcription')
    assert(model.headerData(2, Qt.Horizontal, Qt.DisplayRole) == 'Frequency')
    # Plain spelling column can be added and removed.
    a = Attribute('test', 'spelling', 'Test2')
    model.addColumn(a)
    assert(model.headerData(3, Qt.Horizontal, Qt.DisplayRole) == 'Test2')
    model.removeAttributes(['Test2'])
    assert(len(model.columns) == 3)
    # Abstract tier maps segments to a category label ('t' and 'm' -> 'C').
    a = Attribute('test', 'factor', 'Test')
    model.addAbstractTier(a, {'C': ['t', 'm']})
    assert(model.wordObject(0).test == 'CC')
    model.removeAttributes(['Test'])
    # Count column counts matching segments in the transcription.
    a = Attribute('test', 'numeric', 'Test')
    model.addCountColumn(a, 'transcription', ['t', 'm'])
    assert(model.wordObject(0).test == 2)
    model.removeAttributes(['Test'])
    # Tier column keeps the matching segments themselves.
    a = Attribute('test', 'tier', 'Test')
    model.addTier(a, ['t', 'm'])
    assert(model.wordObject(0).test == ['t', 'm'])
    model.removeAttributes(['Test'])
    w = model.wordObject(0)
    assert(w.spelling == 'atema')
    # Replace the first word with an empty-transcription version.
    w = Word(spelling = 'atema', transcription = [])
    model.replaceWord(0, w)
    w = model.wordObject(0)
    assert(w.spelling == 'atema' and w.transcription == [])
    # Hiding non-lexical items removes the empty-transcription word from view;
    # unhiding restores it.
    model.hideNonLexical(True)
    w = model.wordObject(0)
    assert(w.spelling != 'atema')
    model.hideNonLexical(False)
    w = model.wordObject(0)
    assert(w.spelling == 'atema')
def inspect_discourse_transcription(path):
    """
    Generate a list of AnnotationTypes for a specified text file for parsing
    it as a transcribed text

    Parameters
    ----------
    path : str
        Full path to text file, or to a directory of .txt files

    Returns
    -------
    list of AnnotationTypes
        Autodetected AnnotationTypes for the text file
    """
    trans_delimiters = ['.', ';', ',']
    att = Attribute('transcription', 'tier', 'Transcription')
    a = AnnotationType('Transcription', None, None, attribute=att, base=True)
    # The directory and single-file cases previously duplicated the same
    # scanning loop; both now share _scan_transcription_file.
    if os.path.isdir(path):
        for root, subdirs, files in os.walk(path):
            for filename in files:
                if not filename.lower().endswith('.txt'):
                    continue
                _scan_transcription_file(os.path.join(root, filename), a, trans_delimiters)
    else:
        _scan_transcription_file(path, a, trans_delimiters)
    annotation_types = [a]
    return annotation_types


def _scan_transcription_file(file_path, a, trans_delimiters):
    """Feed one text file's whitespace-split lines into AnnotationType ``a``,
    autodetecting the transcription delimiter from the first line that
    contains one of ``trans_delimiters``."""
    with open(file_path, encoding='utf-8-sig', mode='r') as f:
        for line in f.readlines():
            trial = line.strip().split()
            if a.trans_delimiter is None:
                for t in trial:
                    for delim in trans_delimiters:
                        if delim in t:
                            a.trans_delimiter = delim
                            break
            a.add(trial, save=False)
def test_ilg_basic(ilg_test_dir):
    """Loading the basic ILG fixture yields a lexicon where 'a' occurs twice."""
    fixture_path = os.path.join(ilg_test_dir, 'test_basic.txt')
    transcription_attribute = Attribute('transcription', 'tier')
    transcription_attribute.delimiter = '.'
    spelling_type = AnnotationType('spelling', 'transcription', None,
                                   token=False, anchor=True)
    transcription_type = AnnotationType('transcription', None, None,
                                        token=False, base=True,
                                        attribute=transcription_attribute)
    transcription_type.trans_delimiter = '.'
    corpus = load_discourse_ilg('test', fixture_path,
                                [spelling_type, transcription_type])
    print(corpus.words)
    print(corpus.lexicon.words)
    assert corpus.lexicon.find('a').frequency == 2
def test_ilg_data(ilg_test_dir):
    """ilg_to_data produces the expected spelling Annotations and transcription segments."""
    fixture_path = os.path.join(ilg_test_dir, 'test_basic.txt')
    tier_attribute = Attribute('transcription', 'tier')
    tier_attribute.delimiter = '.'
    spelling_type = AnnotationType('spelling', 'transcription', None,
                                   token=False, anchor=True)
    transcription_type = AnnotationType('transcription', None, None,
                                        token=False, base=True,
                                        attribute=tier_attribute)
    transcription_type.trans_delimiter = '.'
    data = ilg_to_data(fixture_path, [spelling_type, transcription_type])
    # Each expected word references a (begin, end) span on the transcription tier.
    expected_words = []
    for label, begin, end in (('a', 0, 2), ('a', 2, 4), ('b', 4, 6)):
        word_annotation = Annotation(label)
        word_annotation.references.append('transcription')
        word_annotation.begins.append(begin)
        word_annotation.ends.append(end)
        expected_words.append(word_annotation)
    assert data['spelling']._list == expected_words
    assert data['transcription']._list == [BaseAnnotation(segment) for segment in 'ababcd']
def __init__(self, name, subtype, supertype, attribute=None, anchor=False, token=False, base=False, speaker=None, is_default=False):
    """Initialize an annotation type.

    Parameters
    ----------
    name : str
        Internal name of the annotation type (e.g. "Orthography", "Transcription")
    subtype : str or None
        Name of the annotation type one level down in the hierarchy
    supertype : str or None
        Name of the annotation type one level up in the hierarchy
    attribute : Attribute, optional
        Column attribute to use; autogenerated from ``name`` when omitted
    anchor : bool, optional
        Whether this is the spelling (anchor) type
    token : bool, optional
        Whether annotations are token-specific
    base : bool, optional
        Whether this is a transcription/tier type
    speaker : str, optional
        Speaker prefix; stripped from ``name`` to produce ``output_name``
    is_default : bool, optional
        Whether this is the default type of its kind; forwarded to the
        autogenerated Attribute
    """
    self.characters = set()
    self.ignored_characters = set()
    self.digraphs = set()
    self.trans_delimiter = None
    self.syllable_delimiter = None
    self.morph_delimiters = set()
    self.number_behavior = None
    self.stress_specification = dict()
    self.tone_specification = dict()
    # This list contains Annotations for spelling and BaseAnnotations for transcriptions
    self._list = []
    # "name" is confusing - it represents something like "Orthography" or
    # "Transcription", rather than the name the user gave to the column,
    # e.g. "canonical_pron" or "Spelling"; for the user's preferred name,
    # look at self.output_name or self.attribute.
    self.name = name
    self.subtype = subtype
    self.supertype = supertype
    self.token = token
    self.base = base  # base is transcription/tier type
    self.anchor = anchor  # anchor is spelling type
    self.speaker = speaker
    self.ignored = False
    self.is_default = is_default
    if self.speaker is not None:
        # FIX: raw string -- '\W' in a non-raw literal is an invalid escape
        # (SyntaxWarning on Python 3.12+); the regex itself is unchanged.
        self.output_name = re.sub(r'{}\W*'.format(self.speaker), '', self.name)
    else:
        self.output_name = self.name
    if attribute is None:
        # Autogenerate a column attribute matching this type's role
        if base:
            self.attribute = Attribute(Attribute.sanitize_name(name), 'tier', name,
                                       is_default=is_default)
        else:
            self.attribute = Attribute(Attribute.sanitize_name(name), 'spelling', name,
                                       is_default=is_default)
    else:
        self.attribute = attribute
def test_ilg_mismatched(ilg_test_dir):
    """Mismatched spelling/transcription line lengths raise ILGWordMismatchError."""
    fixture_path = os.path.join(ilg_test_dir, 'test_mismatched.txt')
    annotation_types = [
        AnnotationType('spelling', 'transcription', None, token=False, anchor=True),
        AnnotationType('transcription', None, None, token=False, base=True,
                       attribute=Attribute('transcription', 'tier')),
    ]
    annotation_types[1].trans_delimiter = '.'
    with pytest.raises(ILGWordMismatchError):
        load_discourse_ilg('test', fixture_path, annotation_types)
def inspect_discourse_ilg(path, number=None):
    """
    Generate a list of AnnotationTypes for a specified text file for parsing
    it as an interlinear gloss text file

    Parameters
    ----------
    path : str
        Full path to text file
    number : int, optional
        Number of lines per gloss, if not supplied, it is auto-detected

    Returns
    -------
    list of AnnotationTypes
        Autodetected AnnotationTypes for the text file
    """
    trans_delimiters = ['.', ';', ',']
    lines = {}
    if os.path.isdir(path):
        numbers = {}
        for root, subdirs, files in os.walk(path):
            for filename in files:
                if not filename.lower().endswith('.txt'):
                    continue
                p = os.path.join(root, filename)
                lines[p] = text_to_lines(p)
                numbers[p] = calculate_lines_per_gloss(lines[p])
        # Use the most common lines-per-gloss value across all files.
        number = most_frequent_value(numbers)
    else:
        lines[path] = text_to_lines(path)
        number = calculate_lines_per_gloss(lines[path])
        p = path
    # NOTE(review): in the directory branch, ``p`` is left pointing at the
    # last file walked, and the sampling loop below reads that file's lines.
    # Confirm this is intended (and that at least one .txt file exists,
    # otherwise ``p`` is unbound).
    annotation_types = []
    for i in range(number):
        name = 'Line {}'.format(i + 1)
        if i == 0:
            # The first gloss line is assumed to be the spelling/anchor line.
            att = Attribute('spelling', 'spelling', 'Spelling')
            a = AnnotationType(name, None, None, anchor=True, token=False,
                               attribute=att)
        else:
            labels = lines[p][i][1]
            cat = Attribute.guess_type(labels, trans_delimiters)
            att = Attribute(Attribute.sanitize_name(name), cat, name)
            a = AnnotationType(name, None, annotation_types[0].name,
                               token=False, attribute=att)
            if cat == 'tier' and a.trans_delimiter is None:
                # Pick the first delimiter that actually occurs in the labels.
                for l in labels:
                    for delim in trans_delimiters:
                        if delim in l:
                            a.trans_delimiter = delim
                            break
                    if a.trans_delimiter is not None:
                        break
        a.add(lines[p][i][1], save=False)
        annotation_types.append(a)
    # Feed the remaining files' labels into the already-built types.
    for k, v in lines.items():
        if k == p:
            continue
        for i in range(number):
            labels = lines[k][i][1]
            annotation_types[i].add(labels, save=False)
    return annotation_types
def data_to_discourse2(corpus_name=None, wav_path=None, annotation_types=None,
                       support_corpus=None, ignore_case=False,
                       call_back=None, stop_check=None):
    """Build a Discourse object from parsed AnnotationTypes.

    Parameters
    ----------
    corpus_name : str, optional
        Name for the resulting Discourse
    wav_path : str, optional
        Path to an associated sound file
    annotation_types : list of AnnotationType, optional
        Parsed annotation types whose ``_list`` members hold Annotations
        (spellings) or BaseAnnotations (transcription segments)
    support_corpus : Corpus, optional
        Corpus used to look up transcriptions for spelled words
    ignore_case : bool, optional
        Case-insensitive lookups in the support corpus
    call_back : callable, optional
        Progress callback
    stop_check : callable, optional
        Returns True to abort; this function then returns None

    Returns
    -------
    Discourse or None
        The assembled discourse, or None if stopped early.

    NOTE(review): ``annotation_types`` defaults to None but is iterated
    immediately -- a caller must always pass it; confirm intended.
    """
    curr_word = list()
    annotations = {at: list() for at in annotation_types}
    spelling_name, transcription_name = None, None
    if call_back is not None:
        call_back('Processing data...')
        cur = 0
    # Pass 1: normalize every annotation type's _list into (value, begin, end)
    # tuples -- (label, None, None) for spellings, (Transcription, begin, end)
    # for segment runs.
    for at in annotation_types:
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            cur += 1
            call_back(cur)
        if all(isinstance(item, Annotation) for item in at._list):
            # it's a list of spellings, take each one and add it to the
            # overall annotations list
            for item in at._list:
                if item.label:
                    annotations[at].append((item.label, None, None))
        elif all(type(item) == BaseAnnotation for item in at._list):
            # it's a list of transcriptions, with each segment as a
            # BaseAnnotation; group the segments into words using begin/end
            for item in at._list:
                if item.begin is not None:
                    begin = item.begin
                if item.end is None:
                    curr_word.append(item)
                elif item.end is not None:
                    end = item.end
                    curr_word.append(item)
                    curr_word = Transcription(curr_word)
                    annotations[at].append((curr_word, begin, end))
                    curr_word = list()
        else:
            print(at._list)
            raise TypeError("AnnotationType._list cannot contain a mix of Annotations and BaseAnnotations")
    if support_corpus is not None:
        # Look up a transcription for every spelled word in the support
        # corpus; fall back to a punctuation-stripped lookup, then to a
        # per-character transcription.
        spellings = [value for key, value in annotations.items() if key.name == 'Orthography (default)'][0]
        transcriptions = [key for key in annotations if key.name == 'Transcription'][0]
        for index, info in enumerate(spellings):
            spelling = info[0]  # info[1] is the start time, info[2] is the end time (or else None)
            try:
                transcription = support_corpus.find(spelling, ignore_case=ignore_case).transcription
            except KeyError:
                try:
                    no_punctuation = ''.join([x for x in spelling if not x in string.punctuation])
                    transcription = support_corpus.find(no_punctuation, ignore_case=ignore_case).transcription
                except KeyError:
                    transcription = Transcription([symbol for symbol in spelling])
            annotations[transcriptions].append((transcription, index, index + 1))
    # Pass 2: decide which attributes serve as spelling/transcription columns.
    discourse_kwargs = {'name': corpus_name, 'wav_path': wav_path, 'other_attributes': list()}
    for at in annotation_types:
        if at.name == 'Orthography (default)':
            discourse_kwargs['spelling_name'] = at.attribute
        elif at.name == 'Transcription (default)':
            discourse_kwargs['transcription_name'] = at.attribute
        elif at.name == 'Other (character)' or at.attribute.att_type in ('tier', 'spelling'):
            discourse_kwargs['other_attributes'].append(at.attribute)
    if 'spelling_name' not in discourse_kwargs:
        discourse_kwargs['spelling_name'] = Attribute('Spelling', 'spelling', 'Spelling')
    if 'transcription_name' not in discourse_kwargs:
        discourse_kwargs['transcription_name'] = Attribute('Transcription', 'tier', 'Transcription')
    if stop_check is not None and stop_check():
        return
    if call_back is not None:
        cur += 1
        call_back(cur)
    discourse = Discourse(discourse_kwargs)
    if not 'Frequency' in [a.name for a in discourse.lexicon.attributes]:
        # running text will not have a frequency attribute supplied by the user
        # textgrids are also unlikely to have this attribute
        discourse.lexicon.add_attribute(Attribute('frequency', 'numeric', 'Frequency'))
        add_frequency = True
    else:
        add_frequency = False
    # Pass 3: build Words and WordTokens, one per position n, across all
    # annotation types in parallel.
    ind = 0
    limit = max([len(list(v)) for v in annotations.values()])
    for n in range(limit):
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            cur += 1
            call_back(cur)
        word_kwargs = dict()
        for at in annotations:
            if at.token or at.ignored:
                continue
            else:
                try:
                    # annotations[at][n] should be a tuple of
                    # (curr_word, begin, end) or (item_label, None, None)
                    word_kwargs[at.attribute.name] = (at.attribute, annotations[at][n][0])
                except IndexError:
                    word_kwargs[at.attribute.name] = (at.attribute, None)
        word = Word(**word_kwargs)
        try:
            # Reuse the lexicon entry when the word already exists.
            word = discourse.lexicon.find(word.spelling)
            if add_frequency:
                word.frequency += 1
        except KeyError:
            discourse.lexicon.add_word(word)
        word_token_kwargs = dict()
        word_token_kwargs['word'] = word
        begin, end = None, None
        for at in annotations:
            if at.ignored:
                continue
            try:
                # annotations[at][n] should be a tuple of
                # (curr_word, begin, end) or (item_label, None, None)
                word_token_kwargs[at.attribute.name] = (at.attribute, annotations[at][n][0])
            except IndexError:
                word_token_kwargs[at.attribute.name] = (at.attribute, None)
            #word_token_kwargs[at.output_name] = (at.attribute, annotations[at][n][0])
            if at.attribute.att_type == 'tier':
                # Tier types carry the token's time span; fall back to the
                # running index when no times are available.
                if at.attribute.is_default:
                    begin = annotations[at][n][1]
                    end = annotations[at][n][2]
                word_token_kwargs['begin'] = begin if begin is not None else ind
                word_token_kwargs['end'] = end if end is not None else ind + 1
                if at.token:
                    word_token_kwargs['_transcription'] = (at.attribute, annotations[at][n][0])
                    word_token_kwargs['begin'] = begin if begin is not None else ind
                    word_token_kwargs['end'] = end if end is not None else ind + 1
        word_token = WordToken(**word_token_kwargs)
        discourse.add_word(word_token)
        if any(a.token for a in annotations):
            word.wordtokens.append(word_token)
        ind += 1
    return discourse
def inspect_csv(path, num_lines = 10, coldelim = None, transdelim = None):
    """
    Generate a list of AnnotationTypes for a specified text file for parsing
    it as a column-delimited file

    Parameters
    ----------
    path : str
        Full path to text file
    num_lines: int, optional
        The number of lines to parse from the file
    coldelim: str, optional
        A prespecified column delimiter to use, will autodetect if not supplied
    transdelim : list, optional
        A prespecfied set of transcription delimiters to look for, will autodetect if not supplied

    Returns
    -------
    list of AnnotationTypes
        Autodetected AnnotationTypes for the text file
    """
    if coldelim is not None:
        common_delimiters = [coldelim]
    else:
        common_delimiters = [',', '\t', ':', '|']
    if transdelim is not None:
        trans_delimiters = [transdelim]
    else:
        trans_delimiters = ['.', ' ', ';', ',']
    with open(path, 'r', encoding='utf-8') as f:
        lines = []
        head = f.readline().strip()
        # NOTE(review): despite ``num_lines``, the entire file is read here;
        # num_lines only limits the sample passed to guess_type below.
        for line in f.readlines():
            lines.append(line.strip())
        #for i in range(num_lines):
        #    line = f.readline()
        #    if not line:
        #        break
        #    lines.append(line)
    # Pick the candidate delimiter that splits the header into the most columns.
    best = ''
    num = 1
    for d in common_delimiters:
        trial = len(head.split(d))
        if trial > num:
            num = trial
            best = d
    if best == '':
        raise(DelimiterError('The column delimiter specified did not create multiple columns.'))
    head = head.split(best)
    vals = {h: list() for h in head}
    for line in lines:
        l = line.strip().split(best)
        if len(l) != len(head):
            # Row has a different column count than the header.
            raise(PCTError('{}, {}'.format(l, head)))
        for i in range(len(head)):
            vals[head[i]].append(l[i])
    atts = list()
    for h in head:
        # Guess each column's type from the first num_lines values.
        cat = Attribute.guess_type(vals[h][:num_lines], trans_delimiters)
        att = Attribute(Attribute.sanitize_name(h), cat, h)
        a = AnnotationType(h, None, None, token = False, attribute = att)
        if cat == 'tier':
            # Use the first transcription delimiter found in the first value.
            for t in trans_delimiters:
                if t in vals[h][0]:
                    a.trans_delimiter = t
                    break
        a.add(vals[h], save = False)
        atts.append(a)
    return atts, best
def spelling_annotation_type():
    """Build a minimal AnnotationType backed by a spelling-typed Attribute."""
    annotation_type = AnnotationType('test', None, None)
    annotation_type.attribute = Attribute('test', 'spelling')
    return annotation_type
def transcription_annotation_type():
    """Build a minimal transcription AnnotationType delimited by '.'."""
    annotation_type = AnnotationType('test', None, None)
    annotation_type.trans_delimiter = '.'
    annotation_type.attribute = Attribute('test', 'tier')
    return annotation_type
def numeric_annotation_type():
    """Build a minimal AnnotationType backed by a numeric Attribute."""
    annotation_type = AnnotationType('test', None, None)
    annotation_type.attribute = Attribute('test', 'numeric')
    return annotation_type
def inspect_discourse_textgrid(path):
    """
    Generate a list of AnnotationTypes for a specified TextGrid file

    Parameters
    ----------
    path : str
        Full path to TextGrid file

    Returns
    -------
    list of AnnotationTypes
        Autodetected AnnotationTypes for the TextGrid file
    """
    trans_delimiters = ['.', ' ', ';', ',']
    textgrids = []
    if os.path.isdir(path):
        # Collect every .TextGrid file under the directory.
        for root, subdirs, files in os.walk(path):
            for filename in files:
                if not filename.lower().endswith('.textgrid'):
                    continue
                textgrids.append(os.path.join(root, filename))
    else:
        textgrids.append(path)
    anno_types = []
    for t in textgrids:
        tg = load_textgrid(t)
        # Classify tiers into spelling-like and segment-like ones.
        spellings, segments, attributes = guess_tiers(tg)
        if len(segments) == 0:
            base = None
        else:
            base = segments[0]
        if len(spellings) == 0:
            anchor = None
        else:
            anchor = spellings[0]
        interval_tiers = [x for x in tg.tiers if isinstance(x, IntervalTier)]
        if len(anno_types) == 0:
            # First TextGrid: build one AnnotationType per interval tier.
            for ti in interval_tiers:
                if ti.name in spellings:
                    a = AnnotationType(ti.name, base, None, anchor=True, token=False)
                elif ti.name in segments:
                    a = AnnotationType(ti.name, None, anchor, base=True, token=True)
                else:
                    labels = uniqueLabels(ti)
                    cat = Attribute.guess_type(labels, trans_delimiters)
                    att = Attribute(Attribute.sanitize_name(ti.name), cat, ti.name)
                    a = AnnotationType(ti.name, None, anchor, token=False,
                                       attribute=att)
                    if cat == 'tier':
                        # Pick the first delimiter that occurs in any label.
                        for l in labels:
                            for delim in trans_delimiters:
                                if delim in l:
                                    a.trans_delimiter = delim
                                    break
                            if a.trans_delimiter is not None:
                                break
                a.add((x.mark for x in ti), save=False)
                anno_types.append(a)
        else:
            # Subsequent TextGrids must match the first one's tier layout.
            if len(anno_types) != len(interval_tiers):
                raise (PCTError(
                    "The TextGrids must have the same number of tiers."))
            for i, ti in enumerate(interval_tiers):
                anno_types[i].add((x.mark for x in ti), save=False)
    return anno_types
def generateKwargs(self):
    """Collect the neighborhood-density dialog's options into a kwargs dict.

    Returns the kwargs dict on success, or None after showing an error
    dialog when a required option is missing or invalid.
    """
    if self.useQuadratic.isChecked() and int(
            self.maxDistanceEdit.text()) != 1:
        # The quadratic algorithm only applies at max distance 1.
        self.useQuadratic.setChecked(False)
    if self.maxDistanceEdit.text() == '':
        max_distance = None
    else:
        max_distance = float(self.maxDistanceEdit.text())
    ##------------------
    try:
        frequency_cutoff = float(self.minFreqEdit.text())
    except ValueError:
        frequency_cutoff = 0.0
    ##-------------------
    alg = self.algorithmWidget.value()
    typeToken = self.typeTokenWidget.value()
    if self.fileRadio.isChecked():
        #using list of words not in corpus
        # Switch the tier selector to match the file's declared type.
        file_type = self.fileOptions.currentText().split(' ')[-1].strip()
        for tiername in [
                self.tierWidget.tierSelect.itemText(i)
                for i in range(self.tierWidget.tierSelect.count())
        ]:
            if tiername == file_type:
                self.tierWidget.tierSelect.setCurrentText(tiername)
                break
    kwargs = {'corpusModel': self.corpusModel,
              'algorithm': alg,
              'context': self.variantsWidget.value(),
              'sequence_type': self.tierWidget.value(),  #this is just a string
              'tier_type': self.tierWidget.attValue(),  #this is an Attribute type object
              'type_token': typeToken,
              'max_distance': max_distance,
              'frequency_cutoff': frequency_cutoff,
              'num_cores': self.settings['num_cores'],
              'force_quadratic': self.useQuadratic.isChecked(),
              'file_type': self.fileOptions.currentText().split()[-1],
              'collapse_homophones': self.collapseHomophones.isChecked(),
              'output_format': self.saveFileFormat.currentText().split(' ')[-1].lower(),
              'in_corpus': True}
    out_file = self.saveFileWidget.value()
    if out_file == '':
        out_file = None
    else:
        kwargs['output_filename'] = out_file
    kwargs['file_list'] = None
    # Dispatch on the selected query type.
    if self.compType is None:
        reply = QMessageBox.critical(
            self, "Missing information",
            'Please select an option from the "Query" section in the middle of the window.'
        )
        return
    elif self.compType == 'one':
        # Single word looked up in the corpus.
        text = self.oneWordEdit.text()
        if not text:
            reply = QMessageBox.critical(self, "Missing information",
                                         "Please specify a word.")
            return
        try:
            w = self.corpusModel.corpus.find(text)
        except KeyError:
            reply = QMessageBox.critical(
                self, "Invalid information",
                "The spelling specified does match any words in the corpus."
            )
            return
        kwargs['query'] = [w]
        kwargs['output_filename'] = out_file
    elif self.compType == 'nonword':
        # User-constructed word/nonword outside the corpus.
        if self.oneNonword is None:
            reply = QMessageBox.critical(self, "Missing information",
                                         "Please create a word/nonword.")
            return
        if not getattr(self.oneNonword, kwargs['sequence_type']):
            reply = QMessageBox.critical(
                self, "Missing information",
                "Please recreate the word/nonword with '{}' specified.".
                format(self.tierWidget.displayValue()))
            return
        kwargs['query'] = [self.oneNonword]
        kwargs['in_corpus'] = False
        kwargs['output_filename'] = out_file
    elif self.compType == 'file':
        # List of query words read from a file.
        path = self.fileWidget.value()
        kwargs['file_list'] = path
        kwargs['in_corpus'] = False
        if not path:
            reply = QMessageBox.critical(self, "Missing information",
                                         "Please enter a file path.")
            return
        if not os.path.exists(path):
            reply = QMessageBox.critical(
                self, "Invalid information",
                "The file path entered was not found.")
            return
        kwargs['query'] = list()
        file_sequence_type = self.fileOptions.currentText().split(
            ' ')[-1].lower()
        text = load_words_neighden(path, file_sequence_type)
        for t in text:
            kwargs['query'].append(t)
    elif self.compType == 'all':
        # Whole corpus; results land in a new numeric column.
        column = self.columnEdit.text()
        if column == '':
            reply = QMessageBox.critical(self, "Missing information",
                                         "Please enter a column name.")
            return
        colName = column.replace(' ', '_')
        attribute = Attribute(colName, 'numeric', column)
        if column in self.corpusModel.columns:
            msgBox = QMessageBox(
                QMessageBox.Warning, "Duplicate columns",
                # NOTE(review): this string literal was line-wrapped in the
                # collapsed source; confirm exact wording against the
                # original file.
                "'{}' is already the name of a column. Overwrite?".format(
                    column), QMessageBox.NoButton, self)
            msgBox.addButton("Overwrite", QMessageBox.AcceptRole)
            msgBox.addButton("Cancel", QMessageBox.RejectRole)
            if msgBox.exec_() != QMessageBox.AcceptRole:
                return
        kwargs['attribute'] = attribute
    return kwargs
def inspect_discourse_textgrid(path):
    """
    Generate a list of AnnotationTypes for a specified TextGrid file

    Parameters
    ----------
    path : str
        Full path to TextGrid file

    Returns
    -------
    list of AnnotationTypes
        Autodetected AnnotationTypes for the TextGrid file
    """
    trans_delimiters = ['.', ' ', ';', ',']
    textgrids = []
    if os.path.isdir(path):
        # Collect every .TextGrid file under the directory.
        for root, subdirs, files in os.walk(path):
            for filename in files:
                if not filename.lower().endswith('.textgrid'):
                    continue
                textgrids.append(os.path.join(root, filename))
    else:
        textgrids.append(path)
    anno_types = []
    for t in textgrids:
        tg = load_textgrid(t)
        # Classify tiers into spelling-like and segment-like ones.
        spellings, segments, attributes = guess_tiers(tg)
        if len(segments) == 0:
            base = None
        else:
            base = segments[0]
        if len(spellings) == 0:
            anchor = None
        else:
            anchor = spellings[0]
        interval_tiers = [x for x in tg.tiers if isinstance(x, IntervalTier)]
        if len(anno_types) == 0:
            # First TextGrid: build one AnnotationType per interval tier.
            for ti in interval_tiers:
                if ti.name in spellings:
                    a = AnnotationType(ti.name, base, None, anchor = True, token = False)
                elif ti.name in segments:
                    a = AnnotationType(ti.name, None, anchor, base = True, token = True)
                else:
                    labels = uniqueLabels(ti)
                    cat = Attribute.guess_type(labels, trans_delimiters)
                    att = Attribute(Attribute.sanitize_name(ti.name), cat, ti.name)
                    a = AnnotationType(ti.name, None, anchor, token = False,
                                       attribute = att)
                    if cat == 'tier':
                        # Pick the first delimiter that occurs in any label.
                        for l in labels:
                            for delim in trans_delimiters:
                                if delim in l:
                                    a.trans_delimiter = delim
                                    break
                            if a.trans_delimiter is not None:
                                break
                a.add((x.mark for x in ti), save = False)
                anno_types.append(a)
        else:
            # Subsequent TextGrids must match the first one's tier layout.
            if len(anno_types) != len(interval_tiers):
                raise(PCTError("The TextGrids must have the same number of tiers."))
            for i, ti in enumerate(interval_tiers):
                anno_types[i].add((x.mark for x in ti), save = False)
    return anno_types
def inspect_discourse_ilg(path, number = None):
    """
    Generate a list of AnnotationTypes for a specified text file for parsing
    it as an interlinear gloss text file

    Parameters
    ----------
    path : str
        Full path to text file
    number : int, optional
        Number of lines per gloss, if not supplied, it is auto-detected

    Returns
    -------
    list of AnnotationTypes
        Autodetected AnnotationTypes for the text file
    """
    trans_delimiters = ['.', ';', ',']
    lines = {}
    if os.path.isdir(path):
        numbers = {}
        for root, subdirs, files in os.walk(path):
            for filename in files:
                if not filename.lower().endswith('.txt'):
                    continue
                p = os.path.join(root, filename)
                lines[p] = text_to_lines(p)
                numbers[p] = calculate_lines_per_gloss(lines[p])
        # Use the most common lines-per-gloss value across all files.
        number = most_frequent_value(numbers)
    else:
        lines[path] = text_to_lines(path)
        number = calculate_lines_per_gloss(lines[path])
        p = path
    # NOTE(review): in the directory branch, ``p`` is left pointing at the
    # last file walked, and the sampling loop below reads that file's lines.
    # Confirm this is intended (and that at least one .txt file exists,
    # otherwise ``p`` is unbound).
    annotation_types = []
    for i in range(number):
        name = 'Line {}'.format(i+1)
        if i == 0:
            # The first gloss line is assumed to be the spelling/anchor line.
            att = Attribute('spelling', 'spelling', 'Spelling')
            a = AnnotationType(name, None, None, anchor = True, token = False,
                               attribute = att)
        else:
            labels = lines[p][i][1]
            cat = Attribute.guess_type(labels, trans_delimiters)
            att = Attribute(Attribute.sanitize_name(name), cat, name)
            a = AnnotationType(name, None, annotation_types[0].name,
                               token = False, attribute = att)
            if cat == 'tier' and a.trans_delimiter is None:
                # Pick the first delimiter that actually occurs in the labels.
                for l in labels:
                    for delim in trans_delimiters:
                        if delim in l:
                            a.trans_delimiter = delim
                            break
                    if a.trans_delimiter is not None:
                        break
        a.add(lines[p][i][1], save = False)
        annotation_types.append(a)
    # Feed the remaining files' labels into the already-built types.
    for k, v in lines.items():
        if k == p:
            continue
        for i in range(number):
            labels = lines[k][i][1]
            annotation_types[i].add(labels, save = False)
    return annotation_types
def spelling_text_to_data(corpus_name, path, annotation_types = None,
                          support_corpus_path = None, ignore_case = True,
                          stop_check = None, call_back = None):
    """
    Parse an orthographic running-text file into a DiscourseData object,
    optionally looking up transcriptions in a support corpus.
    """
    if annotation_types is None:
        annotation_types = inspect_discourse_spelling(path, support_corpus_path)
    if support_corpus_path is not None:
        # Accept either an in-memory Corpus or a path to a pickled one.
        if isinstance(support_corpus_path, Corpus):
            support = support_corpus_path
        elif os.path.exists(support_corpus_path):
            support = load_binary(support_corpus_path)
        else:
            raise(PCTOSError("The corpus path specified ({}) does not exist".format(support_corpus_path)))
        annotation_types.append(
            AnnotationType('Transcription', None, None,
                           attribute=Attribute('Transcription', 'transcription', 'Transcription'),
                           base=True, is_default=True))
    for at in annotation_types:
        at.reset()
    data = DiscourseData(corpus_name, annotation_types)
    lines = text_to_lines(path)
    if call_back is not None:
        call_back('Processing file...')
        call_back(0, len(lines))
        processed = 0
    for line in lines:
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            processed += 1
            if processed % 20 == 0:
                call_back(processed)
        if not line or line == '\n':
            continue
        annotations = {}
        for token in line:
            # Strip whitespace and any characters the Spelling type ignores.
            spell = ''.join(c for c in token.strip()
                            if c not in data['Spelling'].ignored_characters)
            if spell == '':
                continue
            word = Annotation(spell)
            if support_corpus_path is not None:
                # Look up the transcription; missing words get an empty one.
                try:
                    trans = support.find(spell, ignore_case = ignore_case).transcription
                except KeyError:
                    trans = []
                base_name = data.base_levels[0]
                tier_elements = [BaseAnnotation(seg) for seg in trans]
                start = data.level_length(base_name)
                word.references.append(base_name)
                word.begins.append(start)
                word.ends.append(start + len(tier_elements))
                annotations[base_name] = tier_elements
            annotations['Spelling'] = [word]
            data.add_annotations(**annotations)
    return data
def generateKwargs(self):
    """
    Collect the dialog's widget values into a kwargs dict for the analysis.

    Returns
    -------
    dict or None
        Keyword arguments describing the requested computation, or None
        when required information is missing/invalid (a critical message
        box is shown to the user in that case).
    """
    # An unparseable minimum frequency silently falls back to 0.0.
    try:
        frequency_cutoff = float(self.minFreqEdit.text())
    except ValueError:
        frequency_cutoff = 0.0
    kwargs = {'corpusModel': self.corpusModel,
              'algorithm': self.algorithmWidget.value(),
              'context': self.variantsWidget.value(),
              'sequence_type': self.tierWidget.value(),
              'type_token': self.typeTokenWidget.value(),
              'frequency_cutoff': frequency_cutoff,
              'probability_type': self.probabilityTypeWidget.value()}
    if self.compType is None:
        QMessageBox.critical(self, "Missing information",
                             "Please specify a comparison type.")
        return
    elif self.compType == 'one':
        text = self.oneWordEdit.text()
        if not text:
            QMessageBox.critical(self, "Missing information",
                                 "Please specify a word.")
            return
        try:
            w = self.corpusModel.corpus.find(text)
        except KeyError:
            # BUG FIX: message previously read "does match any words".
            QMessageBox.critical(self, "Invalid information",
                                 "The spelling specified does not match any words in the corpus.")
            return
        kwargs['query'] = [w]
    elif self.compType == 'nonword':
        if self.oneNonword is None:
            QMessageBox.critical(self, "Missing information",
                                 "Please create a word/nonword.")
            return
        # The nonword must carry a value for the selected sequence type.
        if not getattr(self.oneNonword, kwargs['sequence_type']):
            QMessageBox.critical(self, "Missing information",
                                 "Please recreate the word/nonword with '{}' specified.".format(
                                     self.tierWidget.displayValue()))
            return
        kwargs['query'] = [self.oneNonword]
    elif self.compType == 'file':
        path = self.fileWidget.value()
        if not path:
            QMessageBox.critical(self, "Missing information",
                                 "Please enter a file path.")
            return
        if not os.path.exists(path):
            QMessageBox.critical(self, "Invalid information",
                                 "The file path entered was not found.")
            return
        kwargs['query'] = list()
        text = load_words_neighden(path)
        for t in text:
            # Spellings are resolved against the corpus; anything else is
            # assumed to already be a word-like object.
            if isinstance(t, str):
                try:
                    w = self.corpusModel.corpus.find(t)
                except KeyError:
                    QMessageBox.critical(self, "Invalid information",
                                         "The spelling '{}' was not found in the corpus.".format(t))
                    return
            kwargs['query'].append(w)
    elif self.compType == 'all':
        column = self.columnEdit.text()
        if column == '':
            QMessageBox.critical(self, "Missing information",
                                 "Please enter a column name.")
            return
        colName = column.replace(' ', '_')
        attribute = Attribute(colName, 'numeric', column)
        if column in self.corpusModel.columns:
            # Warn before clobbering an existing column of the same name.
            msgBox = QMessageBox(QMessageBox.Warning, "Duplicate columns",
                                 "'{}' is already the name of a column. Overwrite?".format(column),
                                 QMessageBox.NoButton, self)
            msgBox.addButton("Overwrite", QMessageBox.AcceptRole)
            msgBox.addButton("Cancel", QMessageBox.RejectRole)
            if msgBox.exec_() != QMessageBox.AcceptRole:
                return
        kwargs['attribute'] = attribute
    return kwargs
def transcription_text_to_data(corpus_name, path, annotation_types=None,
                               stop_check=None, call_back=None):
    """
    Parse a transcribed running-text file into a DiscourseData object,
    synthesizing each word's spelling from its transcription labels.
    """
    if annotation_types is None:
        annotation_types = inspect_discourse_transcription(path)
    for at in annotation_types:
        at.reset()
    annotation_types.append(
        AnnotationType('Spelling', None, None,
                       attribute=Attribute('Spelling', 'spelling', 'Spelling'),
                       anchor=True))
    data = DiscourseData(corpus_name, annotation_types)
    lines = text_to_lines(path)
    if call_back is not None:
        call_back('Processing file...')
        call_back(0, len(lines))
        count = 0
    tier_name = 'Transcription'
    for line in lines:
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            count += 1
            if count % 20 == 0:
                call_back(count)
        if not line or line == '\n':
            continue
        for token in line:
            trans = parse_transcription(token, data[tier_name])
            # The spelling is simply the concatenated segment labels.
            spell = ''.join(seg.label for seg in trans)
            if spell == '':
                continue
            word = Annotation(spell)
            start = data.level_length(tier_name)
            end = start + len(trans)
            word.references.append(tier_name)
            word.begins.append(start)
            word.ends.append(end)
            trans[0].begin = start
            trans[-1].end = end
            data.add_annotations(**{tier_name: trans, 'Spelling': [word]})
    return data