def test_init():
    """Smoke test: a Discourse accepts WordTokens built around Word objects."""
    specs = [
        (0, 1, 'a', ['a', 'b']),
        (1, 2, 'c', ['c', 'a', 'b']),
        (2, 3, 'a', ['a', 'b']),
        (3, 4, 'd', ['a', 'd']),
    ]
    discourse = Discourse()
    for begin, end, spelling, transcription in specs:
        wordtype = Word(spelling=spelling, transcription=transcription)
        discourse.add_word(WordToken(begin=begin, end=end, word=wordtype))
def data_to_discourse(data, lexicon=None):
    """Build a Discourse (and populate a lexicon) from a parsed data object.

    Parameters
    ----------
    data : DiscourseData-like
        Parsed corpus data exposing ``mapping()``, ``word_levels``,
        ``name`` and ``wav_path``.  # assumes the PCT DiscourseData API — TODO confirm
    lexicon : Corpus, optional
        Corpus to receive word types; defaults to the new Discourse's own lexicon.

    Returns
    -------
    Discourse
        Discourse populated with one WordToken per word-level annotation.
    """
    attribute_mapping = data.mapping()
    d = Discourse(name=data.name, wav_path=data.wav_path)
    ind = 0  # fallback begin/end counter when no timing information exists
    if lexicon is None:
        lexicon = d.lexicon
    for k, v in attribute_mapping.items():
        a = data[k]
        if a.token and v not in d.attributes:
            d.add_attribute(v, initialize_defaults=True)
        # BUG FIX: previously tested membership on d.lexicon.attributes while
        # adding to the (possibly caller-supplied) `lexicon`, so an external
        # lexicon could receive duplicate attributes; test the object we add to.
        if not a.token and v not in lexicon.attributes:
            lexicon.add_attribute(v, initialize_defaults=True)
    for level in data.word_levels:
        for i, s in enumerate(data[level]):
            word_kwargs = {'spelling': (attribute_mapping[level], s.label)}
            word_token_kwargs = {}
            if s.token is not None:
                for token_key, token_value in s.token.items():
                    att = attribute_mapping[token_key]
                    word_token_kwargs[att.name] = (att, token_value)
            if s.additional is not None:
                for add_key, add_value in s.additional.items():
                    att = attribute_mapping[add_key]
                    # token-level properties go on the WordToken, type-level on the Word
                    if data[add_key].token:
                        word_token_kwargs[att.name] = (att, add_value)
                    else:
                        word_kwargs[att.name] = (att, add_value)
            for j, r in enumerate(s.references):
                if r in data and len(data[r]) > 0:
                    seq = data[r][s.begins[j]:s.ends[j]]
                    att = attribute_mapping[r]
                    if data[r].token:
                        word_token_kwargs[att.name] = (att, seq)
                        # use real timing from the referenced segments when available
                        if len(seq) > 0 and seq[0].begin is not None:
                            word_token_kwargs['begin'] = seq[0].begin
                            word_token_kwargs['end'] = seq[-1].end
                    else:
                        word_kwargs[att.name] = (att, seq)
            word = lexicon.get_or_create_word(**word_kwargs)
            word_token_kwargs['word'] = word
            if 'begin' not in word_token_kwargs:
                # no timing info: fall back to the running token index
                word_token_kwargs['begin'] = ind
                word_token_kwargs['end'] = ind + 1
            wordtoken = WordToken(**word_token_kwargs)
            word.frequency += 1
            word.wordtokens.append(wordtoken)
            d.add_word(wordtoken)
            ind += 1
    return d
def test_init():
    """Word-type frequencies must aggregate across tokens after corpus load."""
    token_specs = [
        dict(begin=0, end=1, word={'spelling': 'a', 'transcription': ['a', 'b']},
             following_token_time=1),
        dict(begin=1, end=2, word={'spelling': 'c', 'transcription': ['c', 'a', 'b']},
             previous_token_time=0, following_token_time=2),
        dict(begin=2, end=3, word={'spelling': 'a', 'transcription': ['a', 'b']},
             previous_token_time=1, following_token_time=3),
        dict(begin=3, end=4, word={'spelling': 'd', 'transcription': ['a', 'd']},
             previous_token_time=2),
    ]
    discourse = Discourse()
    for spec in token_specs:
        wordtype = discourse.lexicon.get_or_create_word(**spec.pop('word'))
        wordtype.frequency += 1
        discourse.add_word(WordToken(word=wordtype, **spec))
    corpus = SpontaneousSpeechCorpus('', '')
    corpus.add_discourse(discourse)
    loaded = corpus.discourses['']
    assert loaded[0].wordtype.frequency == 2
    assert loaded[1].wordtype.frequency == 1
def test_init():
    """Smoke test: WordTokens constructed from explicit Word instances load cleanly."""
    entries = (
        ('a', ['a', 'b']),
        ('c', ['c', 'a', 'b']),
        ('a', ['a', 'b']),
        ('d', ['a', 'd']),
    )
    d = Discourse()
    for start, (spelling, transcription) in enumerate(entries):
        token = WordToken(begin=start,
                          end=start + 1,
                          word=Word(spelling=spelling, transcription=transcription))
        d.add_word(token)
def test_init():
    """Token counts should roll up into word-type frequency after corpus insertion."""
    raw_tokens = [
        {'begin': 0, 'end': 1,
         'word': {'spelling': 'a', 'transcription': ['a', 'b']},
         'following_token_time': 1},
        {'begin': 1, 'end': 2,
         'word': {'spelling': 'c', 'transcription': ['c', 'a', 'b']},
         'previous_token_time': 0, 'following_token_time': 2},
        {'begin': 2, 'end': 3,
         'word': {'spelling': 'a', 'transcription': ['a', 'b']},
         'previous_token_time': 1, 'following_token_time': 3},
        {'begin': 3, 'end': 4,
         'word': {'spelling': 'd', 'transcription': ['a', 'd']},
         'previous_token_time': 2},
    ]
    discourse = Discourse()
    for kwargs in raw_tokens:
        # replace the raw dict with a real Word before building the token
        wordtype = discourse.lexicon.get_or_create_word(**kwargs['word'])
        wordtype.frequency += 1
        kwargs['word'] = wordtype
        discourse.add_word(WordToken(**kwargs))
    corpus = SpontaneousSpeechCorpus('', '')
    corpus.add_discourse(discourse)
    reloaded = corpus.discourses['']
    assert reloaded[0].wordtype.frequency == 2
    assert reloaded[1].wordtype.frequency == 1
def data_to_discourse(data, lexicon = None):
    """Construct a Discourse from parsed data, registering word types in a lexicon.

    Parameters
    ----------
    data : DiscourseData-like
        Parsed corpus data exposing ``mapping()``, ``word_levels``,
        ``name`` and ``wav_path``.  # assumes the PCT DiscourseData API — TODO confirm
    lexicon : Corpus, optional
        Destination for word types; defaults to the new Discourse's lexicon.

    Returns
    -------
    Discourse
        Discourse containing one WordToken per word-level annotation.
    """
    attribute_mapping = data.mapping()
    d = Discourse(name = data.name, wav_path = data.wav_path)
    ind = 0  # running index used as fake begin/end when no timing is present
    if lexicon is None:
        lexicon = d.lexicon
    for k, v in attribute_mapping.items():
        a = data[k]
        if a.token and v not in d.attributes:
            d.add_attribute(v, initialize_defaults = True)
        # BUG FIX: membership was checked on d.lexicon.attributes while the
        # attribute was added to `lexicon` (possibly caller-supplied), allowing
        # duplicate attributes on an external lexicon; check the same object.
        if not a.token and v not in lexicon.attributes:
            lexicon.add_attribute(v, initialize_defaults = True)
    for level in data.word_levels:
        for i, s in enumerate(data[level]):
            word_kwargs = {'spelling': (attribute_mapping[level], s.label)}
            word_token_kwargs = {}
            if s.token is not None:
                for token_key, token_value in s.token.items():
                    att = attribute_mapping[token_key]
                    word_token_kwargs[att.name] = (att, token_value)
            if s.additional is not None:
                for add_key, add_value in s.additional.items():
                    att = attribute_mapping[add_key]
                    # token-level values go on the WordToken, type-level on the Word
                    if data[add_key].token:
                        word_token_kwargs[att.name] = (att, add_value)
                    else:
                        word_kwargs[att.name] = (att, add_value)
            for j, r in enumerate(s.references):
                if r in data and len(data[r]) > 0:
                    seq = data[r][s.begins[j]:s.ends[j]]
                    att = attribute_mapping[r]
                    if data[r].token:
                        word_token_kwargs[att.name] = (att, seq)
                        # adopt real timing from the referenced segment span
                        if len(seq) > 0 and seq[0].begin is not None:
                            word_token_kwargs['begin'] = seq[0].begin
                            word_token_kwargs['end'] = seq[-1].end
                    else:
                        word_kwargs[att.name] = (att, seq)
            word = lexicon.get_or_create_word(**word_kwargs)
            word_token_kwargs['word'] = word
            if 'begin' not in word_token_kwargs:
                word_token_kwargs['begin'] = ind
                word_token_kwargs['end'] = ind + 1
            wordtoken = WordToken(**word_token_kwargs)
            word.frequency += 1
            word.wordtokens.append(wordtoken)
            d.add_word(wordtoken)
            ind += 1
    return d
def data_to_discourse(data, lexicon = None, call_back=None, stop_check=None):
    """Construct a Discourse from parsed data, with optional progress reporting.

    Parameters
    ----------
    data : DiscourseData-like
        Parsed corpus data exposing ``mapping()``, ``word_levels``,
        ``name`` and ``wav_path``.  # assumes the PCT DiscourseData API — TODO confirm
    lexicon : Corpus, optional
        Destination for word types; defaults to the new Discourse's lexicon.
    call_back : callable or None
        Optional progress callback; called with a message then step counts.
    stop_check : callable or None
        Optional early-termination check; when it returns truthy, this
        function returns None part-way through.

    Returns
    -------
    Discourse or None
        Populated Discourse, or None if stop_check triggered.
    """
    attribute_mapping = data.mapping()
    spelling_name, transcription_name = None, None
    for name, value in attribute_mapping.items():
        if value.att_type == 'spelling' and value.is_default:
            spelling_name = name
        elif value.att_type == 'tier' and value.is_default:
            transcription_name = name
    dkwargs = {'spelling_name': spelling_name,
               'transcription_name': transcription_name,
               'name': data.name,
               'wav_path': data.wav_path}
    d = Discourse(dkwargs)
    ind = 0  # running index used as fake begin/end when no timing is present
    if lexicon is None:
        lexicon = d.lexicon  # despite the name, this is a Corpus object
    for k, v in attribute_mapping.items():
        a = data[v.name]
        if a.token and v not in d.attributes:
            d.add_attribute(v, initialize_defaults = True)
        # BUG FIX: membership was checked on d.lexicon.attributes while the
        # attribute was added to `lexicon` (possibly caller-supplied), allowing
        # duplicate attributes on an external lexicon; check the same object.
        if not a.token and v not in lexicon.attributes:
            lexicon.add_attribute(v, initialize_defaults = True)
    cur = 0
    if call_back is not None:
        call_back('Processing data...')
    for level in data.word_levels:
        # word_levels is a list of spelling tiers, usually of length 1
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            cur += 1
            call_back(cur)
        for i, s in enumerate(data[level]):
            if not s.label:
                continue
            word_kwargs = {level: (attribute_mapping[level], s.label)}
            word_token_kwargs = {}
            if s.token:
                for token_key, token_value in s.token.items():
                    att = attribute_mapping[token_key]
                    word_token_kwargs[att.name] = (att, token_value)
            if s.additional is not None:
                for add_key, add_value in s.additional.items():
                    att = attribute_mapping[add_key]
                    # token-level values go on the WordToken, type-level on the Word
                    if data[add_key].token:
                        word_token_kwargs[att.name] = (att, add_value)
                    else:
                        word_kwargs[att.name] = (att, add_value)
            for j, r in enumerate(s.references):
                if r in data and len(data[r]) > 0:
                    seq = data[r][s.begins[j]:s.ends[j]]
                    att = attribute_mapping[r]
                    if data[r].token:
                        word_token_kwargs[att.name] = (att, seq)
                        # adopt real timing from the referenced segment span
                        if len(seq) > 0 and seq[0].begin is not None:
                            word_token_kwargs['begin'] = seq[0].begin
                            word_token_kwargs['end'] = seq[-1].end
                    else:
                        word_kwargs[att.name] = (att, seq)
            word = lexicon.get_or_create_word(**word_kwargs)
            word_token_kwargs['word'] = word
            if 'begin' not in word_token_kwargs:
                word_token_kwargs['begin'] = ind
                word_token_kwargs['end'] = ind + 1
            wordtoken = WordToken(**word_token_kwargs)
            word.frequency += 1
            word.wordtokens.append(wordtoken)
            d.add_word(wordtoken)
            ind += 1
    return d
def data_to_discourse2(corpus_name=None, wav_path=None, annotation_types=None, support_corpus=None, ignore_case=False, call_back=None, stop_check=None):
    """Assemble a Discourse from AnnotationType tiers.

    Groups each AnnotationType's items into (value, begin, end) triples,
    optionally fills in transcriptions from a support corpus, then builds
    one Word/WordToken pair per aligned annotation index.

    Parameters
    ----------
    corpus_name : str, optional
        Name for the resulting Discourse.
    wav_path : str, optional
        Path to an associated sound file.
    annotation_types : list of AnnotationType, optional
        Tiers to process; each ``_list`` must be all Annotations (spellings)
        or all BaseAnnotations (transcription segments).
    support_corpus : Corpus, optional
        Corpus used to look up transcriptions for spellings.
    ignore_case : bool
        Passed through to support_corpus lookups.
    call_back : callable or None
        Optional progress callback.
    stop_check : callable or None
        Optional early-termination check; returns None when triggered.

    Returns
    -------
    Discourse or None
        Populated Discourse, or None if stop_check triggered.
    """
    curr_word = list()  # accumulates BaseAnnotations until a segment with an end time closes the word
    annotations = {at: list() for at in annotation_types}
    spelling_name, transcription_name = None, None
    if call_back is not None:
        call_back('Processing data...')
        cur = 0
    for at in annotation_types:
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            cur += 1
            call_back(cur)
        if all(isinstance(item, Annotation) for item in at._list):
            # it's a list of spellings, take each one and add it to the overall annotations list
            for item in at._list:
                if item.label:
                    annotations[at].append((item.label, None, None))
        elif all(type(item) == BaseAnnotation for item in at._list):
            # it's a list of transcriptions, with each segment as a BaseAnnotation
            # NOTE(review): `begin` is only set when a segment carries a begin time;
            # if the first word's segments all have begin None this raises NameError — confirm upstream guarantees
            for item in at._list:
                if item.begin is not None:
                    begin = item.begin
                if item.end is None:
                    curr_word.append(item)
                elif item.end is not None:
                    # a segment with an end time closes the current word
                    end = item.end
                    curr_word.append(item)
                    curr_word = Transcription(curr_word)
                    annotations[at].append((curr_word, begin, end))
                    curr_word = list()
        else:
            print(at._list)
            raise TypeError("AnnotationType._list cannot contain a mix of Annotations and BaseAnnotations")
    if support_corpus is not None:
        # look up (or fall back to letter-by-letter) transcriptions for each spelling
        spellings = [value for key, value in annotations.items() if key.name == 'Orthography (default)'][0]
        # NOTE(review): this matches name 'Transcription' while the kwargs loop below
        # matches 'Transcription (default)' — verify both names occur as intended
        transcriptions = [key for key in annotations if key.name == 'Transcription'][0]
        for index, info in enumerate(spellings):
            spelling = info[0]  # info[1] is the start time, info[2] is the end time (or else None)
            try:
                transcription = support_corpus.find(spelling, ignore_case=ignore_case).transcription
            except KeyError:
                try:
                    # retry with punctuation stripped before giving up
                    no_punctuation = ''.join([x for x in spelling if not x in string.punctuation])
                    transcription = support_corpus.find(no_punctuation, ignore_case=ignore_case).transcription
                except KeyError:
                    # last resort: treat each character of the spelling as a segment
                    transcription = Transcription([symbol for symbol in spelling])
            annotations[transcriptions].append((transcription, index, index + 1))
    discourse_kwargs = {'name': corpus_name, 'wav_path': wav_path, 'other_attributes': list()}
    for at in annotation_types:
        if at.name == 'Orthography (default)':
            discourse_kwargs['spelling_name'] = at.attribute
        elif at.name == 'Transcription (default)':
            discourse_kwargs['transcription_name'] = at.attribute
        elif at.name == 'Other (character)' or at.attribute.att_type in ('tier', 'spelling'):
            discourse_kwargs['other_attributes'].append(at.attribute)
    # fall back to default Attribute objects when no default tiers were supplied
    if 'spelling_name' not in discourse_kwargs:
        discourse_kwargs['spelling_name'] = Attribute('Spelling', 'spelling', 'Spelling')
    if 'transcription_name' not in discourse_kwargs:
        discourse_kwargs['transcription_name'] = Attribute('Transcription', 'tier', 'Transcription')
    if stop_check is not None and stop_check():
        return
    if call_back is not None:
        cur += 1
        call_back(cur)
    discourse = Discourse(discourse_kwargs)
    if not 'Frequency' in [a.name for a in discourse.lexicon.attributes]:
        # running text will not have a frequency attribute supplied by the user
        # textgrids are also unlikely to have this attribute
        discourse.lexicon.add_attribute(Attribute('frequency', 'numeric', 'Frequency'))
        add_frequency = True
    else:
        add_frequency = False
    ind = 0  # running index used as fake begin/end when no timing is present
    limit = max([len(list(v)) for v in annotations.values()])
    for n in range(limit):
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            cur += 1
            call_back(cur)
        word_kwargs = dict()
        for at in annotations:
            if at.token or at.ignored:
                continue
            else:
                try:
                    word_kwargs[at.attribute.name] = (at.attribute, annotations[at][n][0])
                    # annotations[at][n] should be a tuple of (curr_word, begin, end) or (item_label, None, None)
                except IndexError:
                    word_kwargs[at.attribute.name] = (at.attribute, None)
        word = Word(**word_kwargs)
        try:
            # reuse an existing word type; only count frequency if we added the attribute
            word = discourse.lexicon.find(word.spelling)
            if add_frequency:
                word.frequency += 1
        except KeyError:
            discourse.lexicon.add_word(word)
        word_token_kwargs = dict()
        word_token_kwargs['word'] = word
        begin, end = None, None
        for at in annotations:
            if at.ignored:
                continue
            try:
                word_token_kwargs[at.attribute.name] = (at.attribute, annotations[at][n][0])
                # annotations[at][n] should be a tuple of (curr_word, begin, end) or (item_label, None, None)
            except IndexError:
                word_token_kwargs[at.attribute.name] = (at.attribute, None)
            if at.attribute.att_type == 'tier':
                if at.attribute.is_default:
                    begin = annotations[at][n][1]
                    end = annotations[at][n][2]
                    word_token_kwargs['begin'] = begin if begin is not None else ind
                    word_token_kwargs['end'] = end if end is not None else ind + 1
                if at.token:
                    # token tiers also store the raw transcription under a private key
                    word_token_kwargs['_transcription'] = (at.attribute, annotations[at][n][0])
                    word_token_kwargs['begin'] = begin if begin is not None else ind
                    word_token_kwargs['end'] = end if end is not None else ind + 1
        word_token = WordToken(**word_token_kwargs)
        discourse.add_word(word_token)
        if any(a.token for a in annotations):
            word.wordtokens.append(word_token)
        ind += 1
    return discourse
def load_discourse_multiple_files(corpus_name, word_path, phone_path, dialect, annotation_types=None, lexicon=None, feature_system_path=None, stop_check=None, call_back=None):
    """
    Load a discourse from a text file containing interlinear glosses

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    word_path : str
        Full path to words text file
    phone_path : str
        Full path to phones text file
    dialect : str
        Currently, only 'buckeye'
    annotation_types : list of AnnotationType, optional
        List of AnnotationType specifying how to parse the glosses.
        Auto-generated based on dialect.
    lexicon : Corpus, optional
        Corpus to store Discourse word information
    feature_system_path : str
        Full path to pickled FeatureMatrix to use with the Corpus
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
    call_back : callable or None
        Optional function to supply progress information during the loading

    Returns
    -------
    Discourse
        Discourse object generated from the text file
    """
    # NOTE(review): corpus_name, phone_path, lexicon, stop_check and call_back are
    # accepted but never used in this body — confirm whether they are reserved for
    # a future implementation or dead parameters.
    name = os.path.splitext(os.path.split(word_path)[1])[0]
    discourse_kwargs = {'name': name, 'wav_path': find_wav_path(word_path), 'other_attributes': list()}
    for at in annotation_types:
        if at.name == 'Orthography (default)':
            discourse_kwargs['spelling_name'] = at.attribute  #.output_name
        elif at.name == 'Transcription (default)':
            discourse_kwargs['transcription_name'] = at.attribute  #.output_name
        elif at.name == 'Other (character)' or at.attribute.att_type in ('tier', 'spelling'):
            discourse_kwargs['other_attributes'].append(at.attribute)
    discourse = Discourse(discourse_kwargs)
    words = read_words(word_path, dialect)
    ind = 0  # running index used as fake begin/end when timing is missing
    for w in words:
        word_kwargs = {at.output_name: (at.attribute, w[at.output_name]) for at in annotation_types}
        word = Word(**word_kwargs)
        word_token_kwargs = dict()
        for at in annotation_types:
            if at.ignored:
                continue
            word_token_kwargs[at.output_name] = (at.attribute, w[at.output_name])
            word_token_kwargs['word'] = word  # reassigned each iteration; harmless but redundant
            if at.attribute.att_type == 'tier':
                if at.attribute.is_default:
                    begin = w['begin']
                    end = w['end']
                    word_token_kwargs['begin'] = begin if begin is not None else ind
                    word_token_kwargs['end'] = end if end is not None else ind + 1
                if at.token:
                    # token tiers also store the raw transcription under a private key;
                    # NOTE(review): relies on `begin`/`end` set by a default tier earlier
                    # in this loop (or a previous word) — confirm a default tier always exists
                    word_token_kwargs['_transcription'] = (at.attribute, w['transcription'])
        word_token = WordToken(**word_token_kwargs)
        word.wordtokens.append(word_token)
        discourse.lexicon.add_word(word)
        discourse.add_word(word_token)
        ind += 1
    if feature_system_path is not None:
        feature_matrix = load_binary(feature_system_path)
        discourse.lexicon.set_feature_matrix(feature_matrix)
        # upgrade older pickled specifiers to the current format
        discourse.lexicon.specifier = modernize.modernize_specifier(discourse.lexicon.specifier)
    return discourse
def load_discourse_multiple_files(corpus_name, word_path, phone_path, dialect, annotation_types = None, lexicon = None, feature_system_path = None, stop_check = None, call_back = None):
    """
    Load a discourse from a text file containing interlinear glosses

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    word_path : str
        Full path to words text file
    phone_path : str
        Full path to phones text file
    dialect : str
        Currently, only 'buckeye'
    annotation_types : list of AnnotationType, optional
        List of AnnotationType specifying how to parse the glosses.
        Auto-generated based on dialect.
    lexicon : Corpus, optional
        Corpus to store Discourse word information
    feature_system_path : str
        Full path to pickled FeatureMatrix to use with the Corpus
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
    call_back : callable or None
        Optional function to supply progress information during the loading

    Returns
    -------
    Discourse
        Discourse object generated from the text file
    """
    base_name = os.path.splitext(os.path.basename(word_path))[0]
    discourse_kwargs = {'name': base_name,
                        'wav_path': find_wav_path(word_path),
                        'other_attributes': list()}
    for ann_type in annotation_types:
        if ann_type.name == 'Orthography (default)':
            discourse_kwargs['spelling_name'] = ann_type.attribute
        elif ann_type.name == 'Transcription (default)':
            discourse_kwargs['transcription_name'] = ann_type.attribute
        elif ann_type.name == 'Other (character)' or ann_type.attribute.att_type in ('tier', 'spelling'):
            discourse_kwargs['other_attributes'].append(ann_type.attribute)
    discourse = Discourse(discourse_kwargs)
    for token_index, row in enumerate(read_words(word_path, dialect)):
        word = Word(**{ann_type.output_name: (ann_type.attribute, row[ann_type.output_name])
                       for ann_type in annotation_types})
        token_kwargs = dict()
        for ann_type in annotation_types:
            if ann_type.ignored:
                continue
            token_kwargs[ann_type.output_name] = (ann_type.attribute, row[ann_type.output_name])
            token_kwargs['word'] = word
            if ann_type.attribute.att_type != 'tier':
                continue
            if ann_type.attribute.is_default:
                start, stop = row['begin'], row['end']
                token_kwargs['begin'] = start if start is not None else token_index
                token_kwargs['end'] = stop if stop is not None else token_index + 1
            if ann_type.token:
                token_kwargs['_transcription'] = (ann_type.attribute, row['transcription'])
        token = WordToken(**token_kwargs)
        word.wordtokens.append(token)
        discourse.lexicon.add_word(word)
        discourse.add_word(token)
    if feature_system_path is not None:
        discourse.lexicon.set_feature_matrix(load_binary(feature_system_path))
        discourse.lexicon.specifier = modernize.modernize_specifier(discourse.lexicon.specifier)
    return discourse