コード例 #1
0
    def test_coverage(self):
        corpus = Corpus('test')
        for w in self.corpus_basic_info:
            corpus.add_word(Word(**w))

        fm = FeatureMatrix('test', self.feature_no_d_info)

        corpus.set_feature_matrix(fm)

        self.assertEqual(corpus.check_coverage(), ['d'])
コード例 #2
0
    def test_coverage(self):
        corpus = Corpus('test')
        for w in self.corpus_basic_info:
            corpus.add_word(Word(**w))

        fm = FeatureMatrix('test',self.feature_no_d_info)

        corpus.set_feature_matrix(fm)

        self.assertEqual(corpus.check_coverage(),['d'])
コード例 #3
0
ファイル: csv.py プロジェクト: FieldDB/CorpusTools
def load_corpus_csv(corpus_name, path, delimiter,
                    annotation_types = None,
                    feature_system_path = None,
                    stop_check = None, call_back = None):
    """
    Load a corpus from a column-delimited text file

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    path : str
        Full path to text file
    delimiter : str
        Character to use for spliting lines into columns
    annotation_types : list of AnnotationType, optional
        List of AnnotationType specifying how to parse text files
    feature_system_path : str
        Full path to pickled FeatureMatrix to use with the Corpus
    stop_check : callable, optional
        Optional function to check whether to gracefully terminate early
    call_back : callable, optional
        Optional function to supply progress information during the function

    Returns
    -------
    Corpus
        Corpus object generated from the text file

    """
    #begin = time.time()
    corpus = Corpus(corpus_name)
    if feature_system_path is not None and os.path.exists(feature_system_path):
        feature_matrix = load_binary(feature_system_path)
        corpus.set_feature_matrix(feature_matrix)

    if annotation_types is None:
        annotation_types, _ = inspect_csv(path, coldelim = delimiter)
    else:
        for a in annotation_types:
            if a.attribute.name == 'transcription' and a.attribute.att_type != 'tier':
                raise(CorpusIntegrityError(('The column \'{}\' is currently '
                                            'not being parsed as transcriptions '
                                            'despite its name.  Please ensure correct '
                                            'parsing for this column by changing its '
                                            '\'Annotation type\' in the parsing '
                                            'preview to the right.').format(a.name)))
    for a in annotation_types:
        a.reset()

    with open(path, encoding='utf-8') as f:
        headers = f.readline()
        headers = headers.split(delimiter)
        if len(headers)==1:
            e = DelimiterError(('Could not parse the corpus.\n\Check '
                                'that the delimiter you typed in matches '
                                'the one used in the file.'))
            raise(e)
        headers = annotation_types
        for a in headers:
            corpus.add_attribute(a.attribute)
        trans_check = False

        for line in f.readlines():
            line = line.strip()
            if not line: #blank or just a newline
                continue
            d = {}
            for k,v in zip(headers,line.split(delimiter)):
                v = v.strip()
                if k.attribute.att_type == 'tier':
                    trans = parse_transcription(v, k)
                    if not trans_check and len(trans) > 1:
                        trans_check = True
                    d[k.attribute.name] = (k.attribute, trans)
                else:
                    d[k.attribute.name] = (k.attribute, v)
            word = Word(**d)
            if word.transcription:
                #transcriptions can have phonetic symbol delimiters which is a period
                if not word.spelling:
                    word.spelling = ''.join(map(str,word.transcription))

            corpus.add_word(word)
    if corpus.has_transcription and not trans_check:
        e = DelimiterError(('Could not parse transcriptions with that delimiter. '
                            '\n\Check that the transcription delimiter you typed '
                            'in matches the one used in the file.'))
        raise(e)

    transcription_errors = corpus.check_coverage()
    return corpus
コード例 #4
0
def load_corpus_csv(corpus_name, path, delimiter,
                    trans_delimiter,
                    annotation_types = None,
                    feature_system_path = None,
                    stop_check = None, call_back = None):
    """
    Load a corpus from a column-delimited text file

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    path : str
        Full path to text file
    delimiter : str
        Character to use for spliting lines into columns
    trans_delimiter : str
        Character to use for spliting transcriptions into segments
    annotation_types : list of AnnotationType, optional
        List of AnnotationType specifying how to parse text files
    feature_system_path : str
        Full path to pickled FeatureMatrix to use with the Corpus
    stop_check : callable, optional
        Optional function to check whether to gracefully terminate early
    call_back : callable, optional
        Optional function to supply progress information during the function

    Returns
    -------
    Corpus
        Corpus object generated from the text file

    """
    #begin = time.time()
    corpus = Corpus(corpus_name)
    if feature_system_path is not None and os.path.exists(feature_system_path):
        feature_matrix = load_binary(feature_system_path)
        corpus.set_feature_matrix(feature_matrix)

    if annotation_types is None:
        annotation_types, best_delimiter = inspect_csv(path, coldelim = delimiter, transdelim=trans_delimiter)
    else:
        for a in annotation_types:
            if a.attribute.name == 'transcription' and a.attribute.att_type != 'tier':
                raise(CorpusIntegrityError(('The column \'{}\' is currently '
                                            'not being parsed as transcriptions '
                                            'despite its name.  Please ensure correct '
                                            'parsing for this column by changing its '
                                            '\'Annotation type\' in the parsing '
                                            'preview to the right.').format(a.name)))
    for a in annotation_types:
        a.reset()

    with open(path, encoding='utf-8') as f:
        headers = f.readline()
        headers = headers.split(best_delimiter)
        if len(headers)==1:
            e = DelimiterError(('Could not parse the corpus.\n\Check '
                                'that the delimiter you typed in matches '
                                'the one used in the file.'))
            raise(e)
        headers = annotation_types
        for a in headers:
            corpus.add_attribute(a.attribute)
        trans_check = False

        for line in f.readlines():
            line = line.strip()
            if not line: #blank or just a newline
                continue
            d = {}
            for k,v in zip(headers,line.split(best_delimiter)):
                v = v.strip()
                if k.attribute.att_type == 'tier':
                    trans = parse_transcription(v, k)
                    if not trans_check and len(trans) > 1:
                        trans_check = True
                    d[k.attribute.name] = (k.attribute, trans)
                else:
                    d[k.attribute.name] = (k.attribute, v)
            word = Word(**d)
            if word.transcription:
                #transcriptions can have phonetic symbol delimiters which is a period
                if not word.spelling:
                    word.spelling = ''.join(map(str,word.transcription))

            corpus.add_word(word)
    if corpus.has_transcription and not trans_check:
        e = DelimiterError(('Could not parse transcriptions with that delimiter. '
                            '\n\Check that the transcription delimiter you typed '
                            'in matches the one used in the file.'))
        raise(e)

    transcription_errors = corpus.check_coverage()
    return corpus