Exemple #1
0
def ilg_text_to_lines(path, delimiter=None):
    """
    Read an ilg text file into numbered, tokenized lines.

    Parameters
    ----------
    path : str
        Path to the ilg file.  The file is decoded as UTF-8 and a leading
        byte order mark, if present, is discarded.
    delimiter : str, optional
        String that separates the words on a line.  ``None`` (the default)
        splits on runs of whitespace.

    Returns
    -------
    list of tuple
        One ``(line_index, words)`` pair per non-blank line.  ``line_index``
        is the zero-based position of the line in the file (blank lines
        still count towards it) and ``words`` is the list of tokens
        obtained by splitting the stripped line on ``delimiter``.

    Raises
    ------
    DelimiterError
        If ``delimiter`` is given but never occurs in the file.
    """
    with open(path, encoding='utf-8-sig', mode='r') as f:
        text = f.read()
    # Validate only after the file has been closed; a delimiter that never
    # appears could not split any line into multiple words.
    if delimiter is not None and delimiter not in text:
        raise DelimiterError(
            'The delimiter specified does not create multiple words. Please specify another delimiter.'
        )
    lines = []
    for index, raw_line in enumerate(text.splitlines()):
        stripped = raw_line.strip()
        if stripped:
            # Keep the original line number so callers can tell which
            # lines were adjacent in the file.
            lines.append((index, stripped.split(delimiter)))
    return lines
Exemple #2
0
def text_to_lines(path, delimiter=None):
    """
    Parse a text file into tokenized lines.

    Parameters
    ----------
    path : str
        Fully specified path to the text file.  The file is decoded as
        UTF-8 and a leading byte order mark, if present, is discarded.
    delimiter : str, optional
        String that separates the words on a line.  ``None`` (the default)
        splits on runs of whitespace.

    Returns
    -------
    list of list of str
        One list of words per non-blank line, in file order.  Lines that
        are empty or contain only whitespace are skipped.

    Raises
    ------
    DelimiterError
        If ``delimiter`` is given but never occurs in the file.
    """
    with open(path, encoding='utf-8-sig', mode='r') as f:
        text = f.read()
    # Validate only after the file has been closed; a delimiter that never
    # appears could not split any line into multiple words.
    if delimiter is not None and delimiter not in text:
        raise DelimiterError(
            'The delimiter specified does not create multiple words. Please specify another delimiter.'
        )
    stripped_lines = (raw_line.strip() for raw_line in text.splitlines())
    return [line.split(delimiter) for line in stripped_lines if line]
Exemple #3
0
    def parse_discourse(self, path):
        """
        Parse a CSV file for later importing.

        The first line of the file is treated as the header row; every
        following non-blank line is split on ``self.column_delimiter`` and
        the i-th value is added to ``self.annotation_types[i]``.

        Parameters
        ----------
        path : str
            Path to CSV file

        Returns
        -------
        :class:`~polyglotdb.io.discoursedata.DiscourseData`
            Parsed data from the file, named after the file's base name
            (without extension)

        Raises
        ------
        CorpusIntegrityError
            If an annotation type named 'transcription' is not a
            ``TranscriptionAnnotationType``
        DelimiterError
            If the header line does not contain ``self.column_delimiter``
        """
        name = os.path.splitext(os.path.split(path)[1])[0]
        for a in self.annotation_types:
            if a.name == 'transcription' and not isinstance(
                    a, TranscriptionAnnotationType):
                raise CorpusIntegrityError(
                    ('The column \'{}\' is currently '
                     'not being parsed as transcriptions '
                     'despite its name.  Please ensure correct '
                     'parsing for this column by changing its '
                     '\'Annotation type\' in the parsing '
                     'preview to the right.').format(a.name))
        # Start from a clean slate before any values are added.
        for a in self.annotation_types:
            a.reset()
        with open(path, encoding='utf-8') as f:
            headers = f.readline().split(self.column_delimiter)
            if len(headers) == 1:
                # A single "column" means the delimiter never occurred in
                # the header line.
                raise DelimiterError(('Could not parse the corpus.\nCheck '
                                      'that the delimiter you typed in matches '
                                      'the one used in the file.'))

            for line in f:
                # NOTE(review): stripping the whole line before splitting
                # also removes leading/trailing whitespace delimiters (e.g.
                # tabs), which drops empty first/last columns -- confirm
                # this is intended.
                line = line.strip()
                if not line:  # blank or just a newline
                    continue
                values = line.split(self.column_delimiter)
                # zip() with the headers caps the number of values consumed
                # at the number of header columns.
                for i, (_, v) in enumerate(zip(headers, values)):
                    self.annotation_types[i].add([(v.strip(), )])

        pg_annotations = self._parse_annotations()

        data = DiscourseData(name, pg_annotations, self.hierarchy)
        # Clear the accumulated values again once they have been packaged.
        for a in self.annotation_types:
            a.reset()

        return data