def text_to_lines(path, delimiter=None):
    """Read a text file and split it into lists of words, one per non-blank line.

    Parameters
    ----------
    path : str
        Full path to the text file (read as UTF-8; a leading BOM is tolerated)
    delimiter : str, optional
        Word delimiter within each line; ``None`` (the default) splits on
        runs of whitespace, matching ``str.split`` semantics.  Previously
        this was a local hard-coded to ``None``, which made the delimiter
        validation below unreachable dead code.

    Returns
    -------
    list of list of str
        One inner list of words per non-blank line

    Raises
    ------
    DelimiterError
        If an explicit delimiter does not occur anywhere in the file
    """
    with open(path, encoding='utf-8-sig', mode='r') as f:
        text = f.read()
    # An explicit delimiter that never appears cannot split any line into
    # multiple words, so fail loudly instead of silently returning
    # one-word "lines".
    if delimiter is not None and delimiter not in text:
        raise DelimiterError('The delimiter specified does not create '
                             'multiple words. Please specify another delimiter.')
    lines = [x.strip().split(delimiter)
             for x in text.splitlines() if x.strip() != '']
    return lines
def check_feature_coverage_csv(corpus_name, path, delimiter, annotation_types=None,
                               feature_system_path=None, stop_check=None, call_back=None):
    """Print segments found in a column-delimited corpus file that are
    missing from the given feature system.

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus (currently unused here)
    path : str
        Full path to text file
    delimiter : str
        Character to use for splitting lines into columns
    annotation_types : list of AnnotationType, optional
        List of AnnotationType specifying how to parse the file;
        autodetected via ``inspect_csv`` when not supplied
    feature_system_path : str, optional
        Full path to pickled FeatureMatrix to check coverage against;
        when absent there is nothing to check and the function returns
        immediately
    stop_check : callable, optional
        Currently unused; kept for interface parity with the loaders
    call_back : callable, optional
        Currently unused; kept for interface parity with the loaders

    Raises
    ------
    DelimiterError
        If the column delimiter does not split the header into multiple
        columns
    """
    # Bug fix: the original only bound ``feature_matrix`` inside this
    # condition but still dereferenced it below, raising NameError
    # whenever no (existing) feature system path was supplied.
    if feature_system_path is None or not os.path.exists(feature_system_path):
        return
    feature_matrix = load_binary(feature_system_path)
    feature_matrix = modernize.modernize_specifier(feature_matrix)
    if annotation_types is None:
        annotation_types, delimiter = inspect_csv(path, coldelim=delimiter)
    for a in annotation_types:
        a.reset()
    missing = set()
    with open(path, encoding='utf-8-sig') as f:
        headers = f.readline().split(delimiter)
        if len(headers) == 1:
            # Bug fix: the message contained '\n\Check', which rendered a
            # stray backslash to the user; now a plain newline.
            raise DelimiterError('Could not parse the corpus.\nCheck that the '
                                 'column delimiter you typed in matches '
                                 'the one used in the file.')
        headers = annotation_types
        for line in f.readlines():
            line = line.strip()
            if not line:
                continue
            for k, v in zip(headers, line.split(delimiter)):
                v = v.strip()
                # Only transcription tiers contain segments to check.
                if k.attribute.att_type != 'tier':
                    continue
                ignored = k.ignored_characters
                if ignored is not None:
                    v = ''.join(x for x in v if x not in ignored)
                sd = k.syllable_delimiter
                syllables = v.split(sd) if sd is not None else [v]
                td = k.trans_delimiter
                # Stress and tone marks are suprasegmental and are not
                # expected to appear in the feature matrix.
                supra_spec = set(k.stress_specification.keys()) | set(k.tone_specification.keys())
                for syllable in syllables:
                    syllable = ''.join(x for x in syllable if x not in supra_spec)
                    if td is None:
                        if k.digraph_pattern is not None:
                            string = k.digraph_pattern.findall(syllable)
                        else:
                            string = [x for x in syllable]
                    else:
                        string = syllable.split(td)
                    for seg in string:
                        if seg == '':
                            continue
                        if seg not in feature_matrix.segments:
                            missing.add(seg)
    print('In csv.py', missing)
def inspect_csv(path, num_lines=10, coldelim=None, transdelim=None):
    """
    Generate a list of AnnotationTypes for a specified text file for
    parsing it as a column-delimited file

    Parameters
    ----------
    path : str
        Full path to text file
    num_lines : int, optional
        The number of lines to inspect when guessing each column's type
    coldelim : str, optional
        A prespecified column delimiter to use, will autodetect if not
        supplied
    transdelim : list, optional
        A prespecified set of transcription delimiters to look for, will
        autodetect if not supplied

    Returns
    -------
    list of AnnotationType
        Autodetected AnnotationTypes for the text file
    str
        The winning column delimiter
    """
    column_candidates = [coldelim] if coldelim is not None else [',', '\t', ':', '|']
    trans_candidates = [transdelim] if transdelim is not None else ['.', ' ', ';', ',']

    with open(path, 'r', encoding='utf-8-sig') as f:
        header_line = f.readline().strip()
        data_lines = [ln.strip() for ln in f.readlines()]

    # The candidate delimiter splitting the header into the most columns
    # wins; it must produce at least two columns to count at all.
    best = ''
    best_count = 1
    for cand in column_candidates:
        count = len(header_line.split(cand))
        if count > best_count:
            best_count = count
            best = cand
    if not best:
        raise DelimiterError('The column delimiter specified did not create multiple columns.')

    columns = header_line.split(best)
    vals = {name: [] for name in columns}
    for line in data_lines:
        fields = line.strip().split(best)
        if len(fields) != len(columns):
            raise PCTError('{}, {}'.format(fields, columns))
        for name, field in zip(columns, fields):
            vals[name].append(field)

    atts = []
    for name in columns:
        if name in ['Transcription', 'transcription']:
            cat = 'tier'
        else:
            cat = Attribute.guess_type(vals[name][:num_lines], trans_candidates)
        attribute = Attribute(Attribute.sanitize_name(name), cat, name)
        annotation = AnnotationType(name, None, None, token=False, attribute=attribute)
        if cat == 'tier':
            # Adopt the first transcription delimiter that appears in
            # either the first or last value of the column.
            for t in trans_candidates:
                if t in vals[name][0] or t in vals[name][-1]:
                    annotation.trans_delimiter = t
                    break
        annotation.add(vals[name], save=False)
        atts.append(annotation)
    return atts, best
def load_corpus_csv(corpus_name, path, delimiter, annotation_types=None,
                    feature_system_path=None, stop_check=None, call_back=None):
    """
    Load a corpus from a column-delimited text file

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    path : str
        Full path to text file
    delimiter : str
        Character to use for spliting lines into columns
    annotation_types : list of AnnotationType, optional
        List of AnnotationType specifying how to parse text files
    feature_system_path : str, optional
        Full path to pickled FeatureMatrix to use with the Corpus
    stop_check : callable, optional
        Optional function to check whether to gracefully terminate early
    call_back : callable, optional
        Optional function to supply progress information during the function

    Returns
    -------
    Corpus
        Corpus object generated from the text file

    Raises
    ------
    DelimiterError
        If the column delimiter yields a single column, or if the corpus
        has transcriptions but none was ever split into multiple segments
        (i.e. the transcription delimiter looks wrong)
    """
    check_feature_coverage_csv(corpus_name, path, delimiter, annotation_types,
                               feature_system_path, stop_check, call_back)
    corpus = Corpus(corpus_name)
    # Bug fix: feature_matrix was previously unbound when no feature
    # system was supplied, so the parse_transcription call below raised
    # NameError for any corpus with a transcription tier.
    feature_matrix = None
    if feature_system_path is not None and os.path.exists(feature_system_path):
        feature_matrix = load_binary(feature_system_path)
        feature_matrix = modernize.modernize_specifier(feature_matrix)
        corpus.set_feature_matrix(feature_matrix)
    if annotation_types is None:
        annotation_types, delimiter = inspect_csv(path, coldelim=delimiter)
    for a in annotation_types:
        a.reset()
    if call_back is not None:
        call_back('Loading...')
        call_back(0, 0)
    cur = 0
    with open(path, encoding='utf-8-sig') as f:
        headers = f.readline().split(delimiter)
        if len(headers) == 1:
            # Bug fix: message previously contained '\n\Check' (stray
            # backslash shown to the user).
            raise DelimiterError('Could not parse the corpus.\nCheck that the '
                                 'column delimiter you typed in matches '
                                 'the one used in the file.')
        headers = annotation_types
        for a in headers:
            corpus.add_attribute(a.attribute)
        # Becomes True once any transcription splits into more than one
        # segment.  Bug fix: this was initialized to True, which made the
        # bad-transcription-delimiter check at the bottom unreachable
        # (the sibling version of this loader initializes it to False).
        trans_check = False
        for line in f.readlines():
            if stop_check is not None and stop_check():
                return
            if call_back is not None:
                cur += 1
                call_back(cur)
            line = line.strip()
            if not line:  # blank or just a newline
                continue
            d = {}
            for k, v in zip(headers, line.split(delimiter)):
                v = v.strip()
                if k.attribute.att_type == 'tier':
                    # trans is a list of BaseAnnotation
                    trans = parse_transcription(v, k, feature_matrix=feature_matrix, corpus=corpus)
                    if not trans_check and len(trans) > 1:
                        trans_check = True
                    d[k.attribute.name] = (k.attribute, trans)
                else:
                    d[k.attribute.name] = (k.attribute, v)
            word = Word(**d)
            if word.transcription:
                # transcriptions can have phonetic symbol delimiters
                if not word.spelling:
                    word.spelling = ''.join(map(str, word.transcription))
            corpus.add_word(word, allow_duplicates=True)
    if corpus.specifier is not None:
        corpus.inventory.update_features(corpus.specifier)
    # If the corpus has transcriptions but every one parsed as a single
    # segment, the transcription delimiter almost certainly did not match
    # the file's.  (The original nested this under an ``any(len(...) > 1)``
    # condition that contradicts ``not trans_check``, so it never fired.)
    if corpus.has_transcription and not trans_check:
        raise DelimiterError('Could not parse transcriptions with that delimiter. '
                             '\nCheck that the transcription delimiter you typed '
                             'in matches the one used in the file.')
    if stop_check is not None and stop_check():
        return
    return corpus
def load_corpus_csv(corpus_name, path, delimiter, trans_delimiter,
                    annotation_types=None, feature_system_path=None,
                    stop_check=None, call_back=None):
    """
    Load a corpus from a column-delimited text file

    NOTE(review): this is a second definition of ``load_corpus_csv`` in
    the same module; being defined later, it silently shadows the earlier
    one.  Consider removing or renaming one of the two.

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    path : str
        Full path to text file
    delimiter : str
        Character to use for spliting lines into columns
    trans_delimiter : str
        Character to use for spliting transcriptions into segments
    annotation_types : list of AnnotationType, optional
        List of AnnotationType specifying how to parse text files
    feature_system_path : str, optional
        Full path to pickled FeatureMatrix to use with the Corpus
    stop_check : callable, optional
        Optional function to check whether to gracefully terminate early
    call_back : callable, optional
        Optional function to supply progress information during the function

    Returns
    -------
    Corpus
        Corpus object generated from the text file

    Raises
    ------
    CorpusIntegrityError
        If a column named 'transcription' is not parsed as a tier
    DelimiterError
        If the column delimiter yields a single column, or the
        transcription delimiter never split a transcription
    """
    corpus = Corpus(corpus_name)
    if feature_system_path is not None and os.path.exists(feature_system_path):
        feature_matrix = load_binary(feature_system_path)
        corpus.set_feature_matrix(feature_matrix)
    if annotation_types is None:
        annotation_types, best_delimiter = inspect_csv(path, coldelim=delimiter,
                                                       transdelim=trans_delimiter)
    else:
        # Bug fix: best_delimiter was unbound on this branch, so any call
        # supplying annotation_types crashed with NameError below; fall
        # back to the caller-supplied column delimiter.
        best_delimiter = delimiter
        for a in annotation_types:
            if a.attribute.name == 'transcription' and a.attribute.att_type != 'tier':
                raise CorpusIntegrityError(('The column \'{}\' is currently '
                                            'not being parsed as transcriptions '
                                            'despite its name. Please ensure correct '
                                            'parsing for this column by changing its '
                                            '\'Annotation type\' in the parsing '
                                            'preview to the right.').format(a.name))
    for a in annotation_types:
        a.reset()
    # utf-8-sig tolerates a leading BOM, matching the other readers in
    # this module (was plain 'utf-8').
    with open(path, encoding='utf-8-sig') as f:
        headers = f.readline().split(best_delimiter)
        if len(headers) == 1:
            # Bug fix: message previously contained '\n\Check' (stray
            # backslash shown to the user).
            raise DelimiterError('Could not parse the corpus.\nCheck '
                                 'that the delimiter you typed in matches '
                                 'the one used in the file.')
        headers = annotation_types
        for a in headers:
            corpus.add_attribute(a.attribute)
        # True once any transcription splits into more than one segment.
        trans_check = False
        for line in f.readlines():
            line = line.strip()
            if not line:  # blank or just a newline
                continue
            d = {}
            for k, v in zip(headers, line.split(best_delimiter)):
                v = v.strip()
                if k.attribute.att_type == 'tier':
                    trans = parse_transcription(v, k)
                    if not trans_check and len(trans) > 1:
                        trans_check = True
                    d[k.attribute.name] = (k.attribute, trans)
                else:
                    d[k.attribute.name] = (k.attribute, v)
            word = Word(**d)
            if word.transcription:
                # transcriptions can have phonetic symbol delimiters which is a period
                if not word.spelling:
                    word.spelling = ''.join(map(str, word.transcription))
            corpus.add_word(word)
    if corpus.has_transcription and not trans_check:
        raise DelimiterError('Could not parse transcriptions with that delimiter. '
                             '\nCheck that the transcription delimiter you typed '
                             'in matches the one used in the file.')
    # Result was previously bound to an unused local; keep the call for
    # its side effects on the corpus.
    corpus.check_coverage()
    return corpus