def test_coverage(self):
    """Check that check_coverage() returns ['d'] for the basic corpus.

    The corpus is filled from ``self.corpus_basic_info`` and paired with a
    FeatureMatrix built from ``self.feature_no_d_info``.
    """
    lexicon = Corpus('test')
    for entry in self.corpus_basic_info:
        new_word = Word(**entry)
        lexicon.add_word(new_word)

    matrix = FeatureMatrix('test', self.feature_no_d_info)
    lexicon.set_feature_matrix(matrix)

    uncovered = lexicon.check_coverage()
    self.assertEqual(uncovered, ['d'])
def test_coverage(self):
    """Assert that check_coverage() yields ['d'] once the feature matrix
    built from ``self.feature_no_d_info`` is attached to the test corpus.
    """
    test_corpus = Corpus('test')
    for word_kwargs in self.corpus_basic_info:
        test_corpus.add_word(Word(**word_kwargs))

    feature_matrix = FeatureMatrix('test', self.feature_no_d_info)
    test_corpus.set_feature_matrix(feature_matrix)

    expected_missing = ['d']
    self.assertEqual(test_corpus.check_coverage(), expected_missing)
def load_corpus_csv(corpus_name, path, delimiter,
                    annotation_types=None,
                    feature_system_path=None,
                    stop_check=None, call_back=None):
    """
    Load a corpus from a column-delimited text file

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    path : str
        Full path to text file
    delimiter : str
        Character to use for splitting lines into columns
    annotation_types : list of AnnotationType, optional
        List of AnnotationType specifying how to parse text files; when
        omitted, the list is obtained from ``inspect_csv``
    feature_system_path : str, optional
        Full path to pickled FeatureMatrix to use with the Corpus; ignored
        when it is None or the path does not exist
    stop_check : callable, optional
        Optional function to check whether to gracefully terminate early
        (accepted for interface compatibility; not used in this function)
    call_back : callable, optional
        Optional function to supply progress information during the function
        (accepted for interface compatibility; not used in this function)

    Returns
    -------
    Corpus
        Corpus object generated from the text file

    Raises
    ------
    CorpusIntegrityError
        If a supplied annotation type is named 'transcription' but its
        attribute type is not 'tier'
    DelimiterError
        If the header line does not split on ``delimiter``, or if the corpus
        has transcriptions but no parsed transcription had more than one
        segment
    """
    corpus = Corpus(corpus_name)
    if feature_system_path is not None and os.path.exists(feature_system_path):
        # NOTE(review): load_binary presumably unpickles the file; only
        # point this at trusted feature files.
        feature_matrix = load_binary(feature_system_path)
        corpus.set_feature_matrix(feature_matrix)

    if annotation_types is None:
        annotation_types, _ = inspect_csv(path, coldelim=delimiter)
    else:
        for a in annotation_types:
            if a.attribute.name == 'transcription' and a.attribute.att_type != 'tier':
                raise CorpusIntegrityError(
                    ("The column '{}' is currently "
                     "not being parsed as transcriptions "
                     "despite its name. Please ensure correct "
                     "parsing for this column by changing its "
                     "'Annotation type' in the parsing "
                     "preview to the right.").format(a.name))

    for a in annotation_types:
        a.reset()

    with open(path, encoding='utf-8') as f:
        # The header line is only used to sanity-check the column delimiter;
        # the columns themselves are described by annotation_types.
        header_fields = f.readline().split(delimiter)
        if len(header_fields) == 1:
            # The original literal was '\n\Check' (an invalid escape that
            # printed a stray backslash); a blank line is the assumed intent.
            raise DelimiterError(('Could not parse the corpus.\n\nCheck '
                                  'that the delimiter you typed in matches '
                                  'the one used in the file.'))

        headers = annotation_types
        for a in headers:
            corpus.add_attribute(a.attribute)

        # Becomes True once any transcription parses into more than one
        # segment, i.e. the transcription delimiter demonstrably worked.
        trans_check = False

        # Iterate the file lazily instead of materialising every line.
        for line in f:
            line = line.strip()
            if not line:  # blank or just a newline
                continue
            d = {}
            for k, v in zip(headers, line.split(delimiter)):
                v = v.strip()
                if k.attribute.att_type == 'tier':
                    trans = parse_transcription(v, k)
                    if not trans_check and len(trans) > 1:
                        trans_check = True
                    d[k.attribute.name] = (k.attribute, trans)
                else:
                    d[k.attribute.name] = (k.attribute, v)
            word = Word(**d)
            # Fall back to a spelling assembled from the transcription
            # segments when the row supplied none.
            if word.transcription and not word.spelling:
                word.spelling = ''.join(map(str, word.transcription))
            corpus.add_word(word)

    if corpus.has_transcription and not trans_check:
        raise DelimiterError(('Could not parse transcriptions with that delimiter. '
                              '\n\nCheck that the transcription delimiter you typed '
                              'in matches the one used in the file.'))

    # NOTE(review): the result was never used by the original code; the call
    # is kept in case check_coverage has side effects on the corpus.
    corpus.check_coverage()
    return corpus
def load_corpus_csv(corpus_name, path, delimiter, trans_delimiter,
                    annotation_types=None,
                    feature_system_path=None,
                    stop_check=None, call_back=None):
    """
    Load a corpus from a column-delimited text file

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    path : str
        Full path to text file
    delimiter : str
        Character to use for splitting lines into columns
    trans_delimiter : str
        Character to use for splitting transcriptions into segments; it is
        forwarded to ``inspect_csv`` when ``annotation_types`` is omitted
    annotation_types : list of AnnotationType, optional
        List of AnnotationType specifying how to parse text files; when
        omitted, the list (and the column delimiter actually used) is
        obtained from ``inspect_csv``
    feature_system_path : str, optional
        Full path to pickled FeatureMatrix to use with the Corpus; ignored
        when it is None or the path does not exist
    stop_check : callable, optional
        Optional function to check whether to gracefully terminate early
        (accepted for interface compatibility; not used in this function)
    call_back : callable, optional
        Optional function to supply progress information during the function
        (accepted for interface compatibility; not used in this function)

    Returns
    -------
    Corpus
        Corpus object generated from the text file

    Raises
    ------
    CorpusIntegrityError
        If a supplied annotation type is named 'transcription' but its
        attribute type is not 'tier'
    DelimiterError
        If the header line does not split on the column delimiter, or if the
        corpus has transcriptions but no parsed transcription had more than
        one segment
    """
    corpus = Corpus(corpus_name)
    if feature_system_path is not None and os.path.exists(feature_system_path):
        # NOTE(review): load_binary presumably unpickles the file; only
        # point this at trusted feature files.
        feature_matrix = load_binary(feature_system_path)
        corpus.set_feature_matrix(feature_matrix)

    # Default to the caller's delimiter. Previously best_delimiter was only
    # bound in the inspect_csv branch, so passing annotation_types explicitly
    # raised NameError when the header line was split.
    best_delimiter = delimiter
    if annotation_types is None:
        annotation_types, best_delimiter = inspect_csv(
            path, coldelim=delimiter, transdelim=trans_delimiter)
    else:
        for a in annotation_types:
            if a.attribute.name == 'transcription' and a.attribute.att_type != 'tier':
                raise CorpusIntegrityError(
                    ("The column '{}' is currently "
                     "not being parsed as transcriptions "
                     "despite its name. Please ensure correct "
                     "parsing for this column by changing its "
                     "'Annotation type' in the parsing "
                     "preview to the right.").format(a.name))

    for a in annotation_types:
        a.reset()

    with open(path, encoding='utf-8') as f:
        # The header line is only used to sanity-check the column delimiter;
        # the columns themselves are described by annotation_types.
        header_fields = f.readline().split(best_delimiter)
        if len(header_fields) == 1:
            # The original literal was '\n\Check' (an invalid escape that
            # printed a stray backslash); a blank line is the assumed intent.
            raise DelimiterError(('Could not parse the corpus.\n\nCheck '
                                  'that the delimiter you typed in matches '
                                  'the one used in the file.'))

        headers = annotation_types
        for a in headers:
            corpus.add_attribute(a.attribute)

        # Becomes True once any transcription parses into more than one
        # segment, i.e. the transcription delimiter demonstrably worked.
        trans_check = False

        # Iterate the file lazily instead of materialising every line.
        for line in f:
            line = line.strip()
            if not line:  # blank or just a newline
                continue
            d = {}
            for k, v in zip(headers, line.split(best_delimiter)):
                v = v.strip()
                if k.attribute.att_type == 'tier':
                    trans = parse_transcription(v, k)
                    if not trans_check and len(trans) > 1:
                        trans_check = True
                    d[k.attribute.name] = (k.attribute, trans)
                else:
                    d[k.attribute.name] = (k.attribute, v)
            word = Word(**d)
            # Fall back to a spelling assembled from the transcription
            # segments when the row supplied none.
            if word.transcription and not word.spelling:
                word.spelling = ''.join(map(str, word.transcription))
            corpus.add_word(word)

    if corpus.has_transcription and not trans_check:
        raise DelimiterError(('Could not parse transcriptions with that delimiter. '
                              '\n\nCheck that the transcription delimiter you typed '
                              'in matches the one used in the file.'))

    # NOTE(review): the result was never used by the original code; the call
    # is kept in case check_coverage has side effects on the corpus.
    corpus.check_coverage()
    return corpus