Beispiel #1
0
 def __init__(self, filename, **kwargs):
     """
     Initialise the view for the ``ann_morphosyntax.xml`` layer.

     :param filename: Handed to ``XML_Tool`` together with the
         ``'ann_morphosyntax.xml'`` name (presumably the document
         location -- confirm against ``XML_Tool``).
     :param kwargs: Only ``tags`` is consumed; it is stored on
         ``self.tags`` (``None`` when absent).  Other keywords are
         ignored by this initialiser.
     """
     self.tags = kwargs.pop('tags', None)
     self.tagspec = '.*/seg/fs'
     self.xml_tool = XML_Tool(filename, 'ann_morphosyntax.xml')

     # The base view is initialised with whatever
     # build_preprocessed_file() returns, not with ``filename`` itself.
     preprocessed = self.xml_tool.build_preprocessed_file()
     XMLCorpusView.__init__(self, preprocessed, self.tagspec)
Beispiel #2
0
    def __init__(self, fileid, sent, tag, strip_space, stem):
        """
        Create the view and eagerly read the document header.

        :param fileid: The name of the underlying file.
        :param sent: If true, include sentence bracketing.
        :param tag: The name of the tagset to use, or None for no tags.
        :param strip_space: If true, strip spaces from word tokens.
        :param stem: If true, then substitute stems for words.
        """
        # With sentence bracketing the tag specification ends at ``s``;
        # otherwise it targets ``c`` or ``w`` below an ``s``, optionally
        # with further path components in between.
        if sent:
            tagspec = ".*/s"
        else:
            tagspec = ".*/s/(.*/)?(c|w)"
        self._sent = sent
        self._tag = tag
        self._strip_space = strip_space
        self._stem = stem

        self.title = None  #: Title of the document.
        self.author = None  #: Author of the document.
        self.editor = None  #: Editor
        self.resps = None  #: Statement of responsibility

        XMLCorpusView.__init__(self, fileid, tagspec)

        # Read in the header, passing ``handle_header`` as the element
        # handler.  The ``finally`` guarantees the stream opened just
        # above is closed even if reading the header raises.
        self._open()
        try:
            self.read_block(self._stream, ".*/teiHeader$", self.handle_header)
        finally:
            self.close()

        # Reset tag context (presumably altered by the header read above).
        self._tag_context = {0: ()}
Beispiel #3
0
    def __init__(self, fileid, sent, tag, strip_space, stem):
        """
        Create the view and eagerly read the document header.

        :param fileid: The name of the underlying file.
        :param sent: If true, include sentence bracketing.
        :param tag: The name of the tagset to use, or None for no tags.
        :param strip_space: If true, strip spaces from word tokens.
        :param stem: If true, then substitute stems for words.
        """
        # With sentence bracketing the tag specification ends at ``s``;
        # otherwise it targets ``c`` or ``w`` below an ``s``, optionally
        # with further path components in between.
        if sent:
            tagspec = '.*/s'
        else:
            tagspec = '.*/s/(.*/)?(c|w)'
        self._sent = sent
        self._tag = tag
        self._strip_space = strip_space
        self._stem = stem

        self.title = None  #: Title of the document.
        self.author = None  #: Author of the document.
        self.editor = None  #: Editor
        self.resps = None  #: Statement of responsibility

        XMLCorpusView.__init__(self, fileid, tagspec)

        # Read in the header, passing ``handle_header`` as the element
        # handler.  The ``finally`` guarantees the stream opened just
        # above is closed even if reading the header raises.
        self._open()
        try:
            self.read_block(self._stream, '.*/teiHeader$', self.handle_header)
        finally:
            self.close()

        # Reset tag context (presumably altered by the header read above).
        self._tag_context = {0: ()}
Beispiel #4
0
 def __init__(self, filename, **kwargs):
     """
     Initialise the view for the ``ann_morphosyntax.xml`` layer.

     :param filename: Handed to ``XML_Tool`` together with the
         ``'ann_morphosyntax.xml'`` name (presumably the document
         location -- confirm against ``XML_Tool``).
     :param kwargs: Only ``tags`` is consumed; it is stored on
         ``self.tags`` (``None`` when absent).  Other keywords are
         ignored by this initialiser.
     """
     self.tags = kwargs.pop('tags', None)
     self.tagspec = '.*/seg/fs'
     self.xml_tool = XML_Tool(filename, 'ann_morphosyntax.xml')

     # The base view is initialised with whatever
     # build_preprocessed_file() returns, not with ``filename`` itself.
     preprocessed = self.xml_tool.build_preprocessed_file()
     XMLCorpusView.__init__(self, preprocessed, self.tagspec)
Beispiel #5
0
 def __init__(self, filename, **kwargs):
     """
     HEADER_MODE
     Stream-backed corpus view specialised for the ``header.xml``
     files of the NKJP corpus.

     :param filename: Prefix to which ``'header.xml'`` is appended
         verbatim; no path separator is inserted here, so the caller
         must supply one if it is needed.
     :param kwargs: Accepted but not used by this initialiser.
     """
     self.tagspec = ".*/sourceDesc$"
     header_path = filename + 'header.xml'
     XMLCorpusView.__init__(self, header_path, self.tagspec)
Beispiel #6
0
 def __init__(self, filename, **kwargs):
     """
     Initialise the view for the ``text.xml`` layer.

     :param filename: Handed to ``XML_Tool`` together with the
         ``'text.xml'`` name.
     :param kwargs: Only ``mode`` is consumed; it is stored on
         ``self.mode`` and defaults to ``0``.  Other keywords are
         ignored by this initialiser.
     """
     self.mode = kwargs.pop('mode', 0)
     self.tagspec = '.*/div/ab'
     self.segm_dict = {}

     # XML preprocessing: the base class is initialised with the value
     # returned by build_preprocessed_file(), not with ``filename``.
     self.xml_tool = XML_Tool(filename, 'text.xml')
     preprocessed = self.xml_tool.build_preprocessed_file()

     XMLCorpusView.__init__(self, preprocessed, self.tagspec)
Beispiel #7
0
 def __init__(self, filename, **kwargs):
     """
     HEADER_MODE
     Stream-backed corpus view specialised for the ``header.xml``
     files of the NKJP corpus.

     :param filename: Prefix to which ``'header.xml'`` is appended
         verbatim; no path separator is inserted here, so the caller
         must supply one if it is needed.
     :param kwargs: Accepted but not used by this initialiser.
     """
     self.tagspec = ".*/sourceDesc$"
     header_path = filename + 'header.xml'
     XMLCorpusView.__init__(self, header_path, self.tagspec)
Beispiel #8
0
 def __init__(self, filename, **kwargs):
     """
     Initialise the view for the ``text.xml`` layer.

     :param filename: Handed to ``XML_Tool`` together with the
         ``'text.xml'`` name.
     :param kwargs: Only ``mode`` is consumed; it is stored on
         ``self.mode`` and defaults to ``0``.  Other keywords are
         ignored by this initialiser.
     """
     self.mode = kwargs.pop('mode', 0)
     self.tagspec = '.*/div/ab'
     self.segm_dict = {}

     # XML preprocessing: the base class is initialised with the value
     # returned by build_preprocessed_file(), not with ``filename``.
     self.xml_tool = XML_Tool(filename, 'text.xml')
     preprocessed = self.xml_tool.build_preprocessed_file()

     XMLCorpusView.__init__(self, preprocessed, self.tagspec)
Beispiel #9
0
 def __init__(self, filename, **kwargs):
     """
     Initialise the view for the ``ann_segmentation.xml`` layer.

     A companion ``NKJPCorpus_Text_View`` in ``SENTS_MODE`` is created
     for the same ``filename`` and its ``handle_query()`` is invoked
     (return value discarded) before the base view is set up.

     :param filename: Handed both to the companion text view and to
         ``XML_Tool`` (the latter together with the
         ``'ann_segmentation.xml'`` name).
     :param kwargs: Accepted but not used by this initialiser.
     """
     self.tagspec = '.*p/.*s'

     # Companion text view; kept on the instance as ``text_view``.
     sents_mode = NKJPCorpus_Text_View.SENTS_MODE
     self.text_view = NKJPCorpus_Text_View(filename, mode=sents_mode)
     self.text_view.handle_query()

     # XML preprocessing, then base-class initialisation with its result.
     self.xml_tool = XML_Tool(filename, 'ann_segmentation.xml')
     preprocessed = self.xml_tool.build_preprocessed_file()
     XMLCorpusView.__init__(self, preprocessed, self.tagspec)
Beispiel #10
0
 def __init__(self, filename, **kwargs):
     """
     Initialise the view for the ``ann_segmentation.xml`` layer.

     A companion ``NKJPCorpus_Text_View`` in ``SENTS_MODE`` is created
     for the same ``filename`` and its ``handle_query()`` is invoked
     (return value discarded) before the base view is set up.

     :param filename: Handed both to the companion text view and to
         ``XML_Tool`` (the latter together with the
         ``'ann_segmentation.xml'`` name).
     :param kwargs: Accepted but not used by this initialiser.
     """
     self.tagspec = '.*p/.*s'

     # Companion text view; kept on the instance as ``text_view``.
     sents_mode = NKJPCorpus_Text_View.SENTS_MODE
     self.text_view = NKJPCorpus_Text_View(filename, mode=sents_mode)
     self.text_view.handle_query()

     # XML preprocessing, then base-class initialisation with its result.
     self.xml_tool = XML_Tool(filename, 'ann_segmentation.xml')
     preprocessed = self.xml_tool.build_preprocessed_file()
     XMLCorpusView.__init__(self, preprocessed, self.tagspec)
Beispiel #11
0
    def __init__(self, fileid, unit, bracket_sent, pos_tag, sem_tag):
        """
        Store the view options and initialise the base XML view.

        :param fileid: The name of the underlying file.
        :param unit: One of `'token'`, `'word'`, or `'chunk'`.
        :param bracket_sent: If true, include sentence bracketing.
        :param pos_tag: Whether to include part-of-speech tags.
        :param sem_tag: Whether to include semantic tags, namely WordNet lemma
            and OOV named entity status.
        """
        # Sentence bracketing selects the ``s`` level; otherwise the
        # ``punc``/``wf`` children of ``s`` are selected.
        tagspec = '.*/s' if bracket_sent else '.*/s/(punc|wf)'

        self._unit = unit
        self._sent = bracket_sent
        self._pos_tag = pos_tag
        self._sem_tag = sem_tag

        XMLCorpusView.__init__(self, fileid, tagspec)
Beispiel #12
0
    def __init__(self, fileid, unit, bracket_sent, pos_tag, sem_tag, wordnet):
        """
        Store the view options and initialise the base XML view.

        :param fileid: The name of the underlying file.
        :param unit: One of `'token'`, `'word'`, or `'chunk'`.
        :param bracket_sent: If true, include sentence bracketing.
        :param pos_tag: Whether to include part-of-speech tags.
        :param sem_tag: Whether to include semantic tags, namely WordNet lemma
            and OOV named entity status.
        :param wordnet: Stored on ``self._wordnet``; not otherwise used by
            this initialiser (presumably a WordNet reader -- confirm with
            the callers).
        """
        # Sentence bracketing selects the ``s`` level; otherwise the
        # ``punc``/``wf`` children of ``s`` are selected.
        tagspec = '.*/s' if bracket_sent else '.*/s/(punc|wf)'

        self._unit = unit
        self._sent = bracket_sent
        self._pos_tag = pos_tag
        self._sem_tag = sem_tag
        self._wordnet = wordnet

        XMLCorpusView.__init__(self, fileid, tagspec)
Beispiel #13
0
 def __init__(self, fileid, tagspec, elt_handler=None):
     """
     Forward every argument unchanged to ``XMLCorpusView.__init__``.

     :param fileid: Passed through as the base view's first argument.
     :param tagspec: Passed through as the base view's second argument.
     :param elt_handler: Passed through as the third argument
         (``None`` when the caller omits it).
     """
     base_args = (fileid, tagspec, elt_handler)
     XMLCorpusView.__init__(self, *base_args)
Beispiel #14
0
 def __init__(self, fileid, tagspec, elt_handler=None):
     """
     Forward every argument unchanged to ``XMLCorpusView.__init__``.

     :param fileid: Passed through as the base view's first argument.
     :param tagspec: Passed through as the base view's second argument.
     :param elt_handler: Passed through as the third argument
         (``None`` when the caller omits it).
     """
     base_args = (fileid, tagspec, elt_handler)
     XMLCorpusView.__init__(self, *base_args)
Beispiel #15
0
 def __init__(self, filename, **kwargs):
     """
     Initialise the view for the ``ann_morphosyntax.xml`` layer.

     :param filename: Handed to ``XML_Tool`` together with the
         ``"ann_morphosyntax.xml"`` name.
     :param kwargs: Only ``tags`` is consumed; it is stored on
         ``self.tags`` (``None`` when absent).  Other keywords are
         ignored by this initialiser.
     """
     self.tags = kwargs.pop("tags", None)
     self.tagspec = ".*/seg/fs"
     self.xml_tool = XML_Tool(filename, "ann_morphosyntax.xml")

     # The base view is initialised with whatever
     # build_preprocessed_file() returns, not with ``filename`` itself.
     preprocessed = self.xml_tool.build_preprocessed_file()
     XMLCorpusView.__init__(self, preprocessed, self.tagspec)
 def __init__(self, filename, **kwargs):
     """
     Initialise the view for the ``ann_named.xml`` layer.

     :param filename: Handed to ``XML_Tool`` together with the
         ``'ann_named.xml'`` name.
     :param kwargs: Accepted but not used by this initialiser.
     """
     self.tagspec = '.*/seg/fs'
     self.xml_tool = XML_Tool(filename, 'ann_named.xml')

     # The base view is initialised with whatever
     # build_preprocessed_file() returns, not with ``filename`` itself.
     preprocessed = self.xml_tool.build_preprocessed_file()
     XMLCorpusView.__init__(self, preprocessed, self.tagspec)