Esempio n. 1
0
    def __init__(self, fileid, sent, tag, strip_space, stem):
        """
        Create the view and read the document header once.

        :param fileid: The name of the underlying file.
        :param sent: If true, include sentence bracketing.
        :param tag: The name of the tagset to use, or None for no tags.
        :param strip_space: If true, strip spaces from word tokens.
        :param stem: If true, then substitute stems for words.
        """
        # With sentence bracketing the view matches paths ending in ``s``;
        # otherwise it matches ``c``/``w`` paths nested below an ``s``.
        tagspec = '.*/s' if sent else '.*/s/(.*/)?(c|w)'

        self._sent, self._tag = sent, tag
        self._strip_space, self._stem = strip_space, stem

        # Header fields start out unset.
        # NOTE(review): presumably populated by ``handle_header`` -- confirm.
        self.title = None  #: Title of the document.
        self.author = None  #: Author of the document.
        self.editor = None  #: Editor
        self.resps = None  #: Statement of responsibility

        XMLCorpusView.__init__(self, fileid, tagspec)

        # One pass over the TEI header, then release the stream again.
        self._open()
        self.read_block(self._stream, '.*/teiHeader$', self.handle_header)
        self.close()

        # Reset tag context.
        self._tag_context = {0: ()}
Esempio n. 2
0
 def __init__(self, filename, **kwargs):
     """
     Set up a view over the preprocessed ``ann_morphosyntax.xml`` file.

     :param filename: Passed to ``XML_Tool`` together with the name
         ``'ann_morphosyntax.xml'``.
     :param kwargs: Only ``tags`` is read; it defaults to ``None``.
     """
     self.tags = kwargs.pop('tags', None)
     self.tagspec = '.*/seg/fs'
     self.xml_tool = XML_Tool(filename, 'ann_morphosyntax.xml')
     # The base class reads the file produced by the preprocessing step.
     preprocessed = self.xml_tool.build_preprocessed_file()
     XMLCorpusView.__init__(self, preprocessed, self.tagspec)
Esempio n. 3
0
    def __init__(self, fileid, sent, tag, strip_space, stem):
        """
        Create the view and read the document header once.

        :param fileid: The name of the underlying file.
        :param sent: If true, include sentence bracketing.
        :param tag: The name of the tagset to use, or None for no tags.
        :param strip_space: If true, strip spaces from word tokens.
        :param stem: If true, then substitute stems for words.
        """
        # With sentence bracketing the view matches paths ending in ``s``;
        # otherwise it matches ``c``/``w`` paths nested below an ``s``.
        tagspec = ".*/s" if sent else ".*/s/(.*/)?(c|w)"

        self._sent, self._tag = sent, tag
        self._strip_space, self._stem = strip_space, stem

        # Header fields start out unset.
        # NOTE(review): presumably populated by ``handle_header`` -- confirm.
        self.title = None  #: Title of the document.
        self.author = None  #: Author of the document.
        self.editor = None  #: Editor
        self.resps = None  #: Statement of responsibility

        XMLCorpusView.__init__(self, fileid, tagspec)

        # One pass over the TEI header, then release the stream again.
        self._open()
        self.read_block(self._stream, ".*/teiHeader$", self.handle_header)
        self.close()

        # Reset tag context.
        self._tag_context = {0: ()}
Esempio n. 4
0
 def __init__(self, filename, **kwargs):
     """
     Initialise a view backed by the preprocessed morphosyntax annotation.

     :param filename: Handed to ``XML_Tool`` along with
         ``'ann_morphosyntax.xml'``.
     :param kwargs: ``tags`` is popped (default ``None``); nothing else
         is read.
     """
     self.tags = kwargs.pop('tags', None)
     self.tagspec = '.*/seg/fs'
     self.xml_tool = XML_Tool(filename, 'ann_morphosyntax.xml')
     source = self.xml_tool.build_preprocessed_file()
     XMLCorpusView.__init__(self, source, self.tagspec)
Esempio n. 5
0
 def __init__(self, filename, **kwargs):
     """
     Set up a view over the preprocessed ``text.xml`` file.

     :param filename: Passed to ``XML_Tool`` together with ``'text.xml'``.
     :param kwargs: Only ``mode`` is read; it defaults to ``0``.
     """
     self.mode = kwargs.pop('mode', 0)
     self.tagspec = '.*/div/ab'
     self.segm_dict = {}
     # Preprocess the XML first; the base class is pointed at the result.
     self.xml_tool = XML_Tool(filename, 'text.xml')
     preprocessed = self.xml_tool.build_preprocessed_file()
     XMLCorpusView.__init__(self, preprocessed, self.tagspec)
Esempio n. 6
0
 def __init__(self, filename, **kwargs):
     """
     Set up a stream-backed view over a ``header.xml`` file of the NKJP
     corpus (HEADER_MODE).

     :param filename: Prefix to which ``'header.xml'`` is appended to
         form the name of the file that is read.
     :param kwargs: Accepted but not used here.
     """
     self.tagspec = ".*/sourceDesc$"
     header_file = filename + 'header.xml'
     XMLCorpusView.__init__(self, header_file, self.tagspec)
Esempio n. 7
0
 def __init__(self, filename, **kwargs):
     """
     Initialise a HEADER_MODE view: a stream-backed corpus view over the
     ``header.xml`` file of an NKJP corpus entry.

     :param filename: String prefix; ``'header.xml'`` is concatenated
         onto it to obtain the file handed to the base class.
     :param kwargs: Ignored by this constructor.
     """
     self.tagspec = ".*/sourceDesc$"
     path = filename + 'header.xml'
     XMLCorpusView.__init__(self, path, self.tagspec)
Esempio n. 8
0
 def __init__(self, filename, **kwargs):
     """
     Initialise a view backed by the preprocessed ``text.xml`` file.

     :param filename: Handed to ``XML_Tool`` along with ``'text.xml'``.
     :param kwargs: ``mode`` is popped (default ``0``); nothing else is
         read.
     """
     self.mode = kwargs.pop('mode', 0)
     self.tagspec = '.*/div/ab'
     self.segm_dict = {}
     self.xml_tool = XML_Tool(filename, 'text.xml')
     # The base class is initialised on the preprocessed copy.
     source = self.xml_tool.build_preprocessed_file()
     XMLCorpusView.__init__(self, source, self.tagspec)
Esempio n. 9
0
 def __init__(self, filename, **kwargs):
     """
     Set up a view over the preprocessed ``ann_segmentation.xml`` file.

     A ``NKJPCorpus_Text_View`` in ``SENTS_MODE`` is created for the same
     ``filename`` and queried once; the query's return value is discarded.
     NOTE(review): presumably the query is run for its side effects on
     the text view -- confirm against ``NKJPCorpus_Text_View``.

     :param filename: Passed to both ``NKJPCorpus_Text_View`` and
         ``XML_Tool``.
     :param kwargs: Accepted but not used here.
     """
     self.tagspec = '.*p/.*s'
     sents_mode = NKJPCorpus_Text_View.SENTS_MODE
     self.text_view = NKJPCorpus_Text_View(filename, mode=sents_mode)
     self.text_view.handle_query()
     # Preprocess the XML first; the base class is pointed at the result.
     self.xml_tool = XML_Tool(filename, 'ann_segmentation.xml')
     preprocessed = self.xml_tool.build_preprocessed_file()
     XMLCorpusView.__init__(self, preprocessed, self.tagspec)
Esempio n. 10
0
 def __init__(self, filename, **kwargs):
     """
     Initialise a view backed by the preprocessed segmentation annotation.

     Before the base class is initialised, a companion
     ``NKJPCorpus_Text_View`` (``SENTS_MODE``) is built for ``filename``
     and its ``handle_query()`` is called; the returned value is not kept.
     NOTE(review): the call looks like it is made for side effects on
     ``self.text_view`` -- verify.

     :param filename: Given to ``NKJPCorpus_Text_View`` and to
         ``XML_Tool`` (with ``'ann_segmentation.xml'``).
     :param kwargs: Ignored by this constructor.
     """
     self.tagspec = '.*p/.*s'
     mode = NKJPCorpus_Text_View.SENTS_MODE
     self.text_view = NKJPCorpus_Text_View(filename, mode=mode)
     self.text_view.handle_query()
     self.xml_tool = XML_Tool(filename, 'ann_segmentation.xml')
     source = self.xml_tool.build_preprocessed_file()
     XMLCorpusView.__init__(self, source, self.tagspec)
Esempio n. 11
0
 def read_block(self, stream, tagspec=None, elt_handler=None):
     """
     Read one block through ``XMLCorpusView.read_block`` and drop every
     entry that is ``None``.

     :param stream: Forwarded unchanged to the base implementation.
     :param tagspec: Forwarded unchanged to the base implementation.
     :param elt_handler: Forwarded unchanged to the base implementation.
     :return: A list of the non-``None`` entries, in their original order.
     """
     block = XMLCorpusView.read_block(self, stream, tagspec, elt_handler)
     return [item for item in block if item is not None]
Esempio n. 12
0
 def read_block(self, stream, tagspec=None, elt_handler=None):
     """
     Delegate to ``XMLCorpusView.read_block`` and filter out ``None``.

     :param stream: Passed straight through to the base class.
     :param tagspec: Passed straight through to the base class.
     :param elt_handler: Passed straight through to the base class.
     :return: List of the entries that are not ``None``, order preserved.
     """
     raw = XMLCorpusView.read_block(self, stream, tagspec, elt_handler)
     return [entry for entry in raw if entry is not None]
 def tagged_words(self, lemmatize=True):
     """
     Return one tuple per element matched by ``'.*/w'`` in ``self.path``.

     :param lemmatize: If true, each tuple also carries the result of
         ``self.get_lemma`` for the element.
     :return: A list of ``(text, pos)`` tuples, or
         ``(text, pos, lemma)`` tuples when ``lemmatize`` is true;
         ``pos`` is the element's ``pos`` attribute.
     """
     view = XMLCorpusView(self.path, '.*/w')
     if not lemmatize:
         return [(w.text, w.attrib['pos']) for w in view]
     return [(w.text, w.attrib['pos'], self.get_lemma(w)) for w in view]
 def sentences(self):
     """Returns a list of sentences where each sentence is a list of words.

     A sentence is any element matched by ``'.*/sentence'`` in
     ``self.path``; its words are the ``text`` of its child elements.
     """
     view = XMLCorpusView(self.path, '.*/sentence')
     return [[child.text for child in sentence] for sentence in view]
Esempio n. 15
0
    def __init__(self, fileid, unit, bracket_sent, pos_tag, sem_tag):
        """
        Store the view options and initialise the base corpus view.

        :param fileid: The name of the underlying file.
        :param unit: One of `'token'`, `'word'`, or `'chunk'`.
        :param bracket_sent: If true, include sentence bracketing.
        :param pos_tag: Whether to include part-of-speech tags.
        :param sem_tag: Whether to include semantic tags, namely WordNet lemma
            and OOV named entity status.
        """
        self._unit = unit
        self._sent = bracket_sent
        self._pos_tag = pos_tag
        self._sem_tag = sem_tag

        # With bracketing the view matches paths ending in ``s``; otherwise
        # it matches ``punc``/``wf`` paths directly below an ``s``.
        tagspec = '.*/s' if bracket_sent else '.*/s/(punc|wf)'
        XMLCorpusView.__init__(self, fileid, tagspec)
Esempio n. 16
0
 def handle_query(self):
     """
     Read the whole stream block by block and return everything read.

     The stream is opened with ``self._open()`` and blocks are pulled via
     ``XMLCorpusView.read_block`` until an empty block is returned.

     :return: A flat list with the contents of every block, in order.
     :raises: Whatever ``XMLCorpusView.read_block`` raises; the stream is
         closed before the exception propagates.
     """
     self._open()
     header = []
     try:
         while True:
             segm = XMLCorpusView.read_block(self, self._stream)
             if not segm:
                 break
             header.extend(segm)
     finally:
         # Release the stream even when a block fails to parse.
         self.close()
     return header
Esempio n. 17
0
    def __init__(self, fileid, unit, bracket_sent, pos_tag, sem_tag, wordnet):
        """
        Store the view options and initialise the base corpus view.

        :param fileid: The name of the underlying file.
        :param unit: One of `'token'`, `'word'`, or `'chunk'`.
        :param bracket_sent: If true, include sentence bracketing.
        :param pos_tag: Whether to include part-of-speech tags.
        :param sem_tag: Whether to include semantic tags, namely WordNet lemma
            and OOV named entity status.
        :param wordnet: Stored as ``self._wordnet``; not otherwise used by
            this constructor.
        """
        self._unit = unit
        self._sent = bracket_sent
        self._pos_tag = pos_tag
        self._sem_tag = sem_tag
        self._wordnet = wordnet

        # With bracketing the view matches paths ending in ``s``; otherwise
        # it matches ``punc``/``wf`` paths directly below an ``s``.
        tagspec = '.*/s' if bracket_sent else '.*/s/(punc|wf)'
        XMLCorpusView.__init__(self, fileid, tagspec)
Esempio n. 18
0
 def handle_query(self):
     """
     Drain the stream and return the concatenation of all blocks.

     Opens the stream with ``self._open()`` and keeps calling
     ``XMLCorpusView.read_block`` until it yields an empty block.

     :return: A flat list holding the items of every block, in order.
     :raises: Any exception raised by ``XMLCorpusView.read_block``; the
         stream is closed first.
     """
     self._open()
     header = []
     try:
         while True:
             segm = XMLCorpusView.read_block(self, self._stream)
             if not segm:
                 break
             header.extend(segm)
     finally:
         # Close the stream on the error path too, so it is not leaked.
         self.close()
     return header
 def tagged_sentences(self, lemmatize=True):
     """
     Return one list of tagged words per element matched by
     ``'.*/sentence'`` in ``self.path``.

     :param lemmatize: If true, each word tuple also carries the result
         of ``self.get_lemma`` for the word element.
     :return: A list of sentences; each sentence is a list of
         ``(text, pos)`` or ``(text, pos, lemma)`` tuples built from the
         sentence element's children.
     """
     view = XMLCorpusView(self.path, '.*/sentence')
     if lemmatize:
         return [
             [(w.text, w.attrib['pos'], self.get_lemma(w)) for w in sent]
             for sent in view
         ]
     return [[(w.text, w.attrib['pos']) for w in sent] for sent in view]
    def _detect_encoding(self, fileid):
        """
        Return the encoding declared on the first line of ``fileid``.

        The first line is searched for ``encoding="..."`` and then for
        ``encoding='...'``; if neither is present the decision is left to
        ``XMLCorpusView._detect_encoding``.

        :param fileid: A ``PathPointer`` (opened through its ``open()``
            method) or a path accepted by the builtin ``open``.
        :return: The declared encoding name, or the base class's result.
        """
        if isinstance(fileid, PathPointer):
            infile = fileid.open()
        else:
            infile = open(fileid, 'rb')
        try:
            s = infile.readline()
        finally:
            # Do not leak the handle; only the first line is needed.
            infile.close()

        # A binary read yields bytes, and str patterns cannot be applied to
        # bytes on Python 3. latin-1 maps every byte to a character, so the
        # decode itself can never fail.
        if isinstance(s, bytes):
            s = s.decode('latin-1')

        # Double-quoted declarations take precedence over single-quoted.
        for pattern in (r'encoding="([^"]+)"', r"encoding='([^']+)'"):
            m = re.search(pattern, s)
            if m:
                return m.group(1)

        return XMLCorpusView._detect_encoding(self, fileid)
Esempio n. 21
0
    def read_block(self, stream, tagspec=None, elt_handler=None):
        """
        Returns the text read from ``stream`` as a one-element list.

        Blocks are pulled from ``XMLCorpusView.read_block`` until an empty
        block comes back; all pieces are then joined with single spaces.

        :param stream: The stream to read from.
        :param tagspec: Accepted but not forwarded to the base class.
        :param elt_handler: Accepted but not forwarded to the base class.
        :return: A list containing exactly one string.
        """
        pieces = []
        segm = XMLCorpusView.read_block(self, stream)
        while len(segm) != 0:
            pieces.extend(segm)
            segm = XMLCorpusView.read_block(self, stream)

        return [' '.join(pieces)]
Esempio n. 22
0
    def read_block(self, stream, tagspec=None, elt_handler=None):
        """
        Returns everything read from ``stream`` as a single joined string,
        wrapped in a list.

        ``XMLCorpusView.read_block`` is called repeatedly until it returns
        an empty block; the collected parts are joined with ``' '``.

        :param stream: The stream to read from.
        :param tagspec: Ignored; the base class is called without it.
        :param elt_handler: Ignored; the base class is called without it.
        :return: A list with exactly one string element.
        """
        parts = []
        block = XMLCorpusView.read_block(self, stream)
        while len(block) != 0:
            parts.extend(block)
            block = XMLCorpusView.read_block(self, stream)

        return [' '.join(parts)]
Esempio n. 23
0
 def handle_query(self):
     """
     Read the whole stream and return every non-``None`` item.

     Blocks are pulled via ``XMLCorpusView.read_block`` until an empty
     block is returned. The preprocessed file is removed through
     ``self.xml_tool`` whether or not reading succeeds.

     :return: A flat list of the non-``None`` items of all blocks.
     :raises: The original exception from opening or reading. Earlier
         code replaced it with a bare ``Exception``, hiding the cause.
     """
     words = []
     try:
         self._open()
         try:
             while True:
                 segm = XMLCorpusView.read_block(self, self._stream)
                 if not segm:
                     break
                 words.extend(part for part in segm if part is not None)
         finally:
             # Close the stream before the file behind it is removed.
             self.close()
     finally:
         self.xml_tool.remove_preprocessed_file()
     return words
Esempio n. 24
0
 def handle_query(self):
     """
     Read the whole stream and return one result per segment.

     Blocks are pulled via ``XMLCorpusView.read_block`` until an empty
     block is returned. Each segment is passed through
     ``self.remove_choice`` and then ``self.get_sentences``. The
     preprocessed file is removed through ``self.xml_tool`` whether or
     not reading succeeds.

     :return: A list with the ``get_sentences`` result of every segment.
     :raises: The original exception from opening, reading or segment
         handling. Earlier code replaced it with a bare ``Exception``,
         hiding the cause.
     """
     sentences = []
     try:
         self._open()
         try:
             while True:
                 sent_segm = XMLCorpusView.read_block(self, self._stream)
                 if not sent_segm:
                     break
                 for segm in sent_segm:
                     segm = self.remove_choice(segm)
                     sentences.append(self.get_sentences(segm))
         finally:
             # Close the stream before the file behind it is removed.
             self.close()
     finally:
         self.xml_tool.remove_preprocessed_file()
     return sentences
Esempio n. 25
0
 def handle_query(self):
     """
     Drain the stream and collect the processed form of each segment.

     ``XMLCorpusView.read_block`` is called until it yields an empty
     block; every segment goes through ``self.remove_choice`` followed by
     ``self.get_sentences``. ``self.xml_tool.remove_preprocessed_file()``
     runs on both the success and the failure path.

     :return: A list of ``get_sentences`` results, one per segment.
     :raises: The exception that actually occurred, unchanged. Earlier
         code raised a bare ``Exception`` in its place.
     """
     sentences = []
     try:
         self._open()
         try:
             while True:
                 sent_segm = XMLCorpusView.read_block(self, self._stream)
                 if not sent_segm:
                     break
                 for segm in sent_segm:
                     cleaned = self.remove_choice(segm)
                     sentences.append(self.get_sentences(cleaned))
         finally:
             # The stream is released first, then the file is removed.
             self.close()
     finally:
         self.xml_tool.remove_preprocessed_file()
     return sentences
Esempio n. 26
0
 def handle_query(self):
     """
     Drain the stream and return all items that are not ``None``.

     ``XMLCorpusView.read_block`` is called until it yields an empty
     block. ``self.xml_tool.remove_preprocessed_file()`` runs on both the
     success and the failure path.

     :return: A flat list of the non-``None`` items of every block.
     :raises: The exception that actually occurred, unchanged. Earlier
         code raised a bare ``Exception`` in its place.
     """
     words = []
     try:
         self._open()
         try:
             while True:
                 segm = XMLCorpusView.read_block(self, self._stream)
                 if not segm:
                     break
                 words.extend(part for part in segm if part is not None)
         finally:
             # The stream is released first, then the file is removed.
             self.close()
     finally:
         self.xml_tool.remove_preprocessed_file()
     return words
Esempio n. 27
0
 def __init__(self, filename, **kwargs):
     """
     Set up a view over the preprocessed ``ann_morphosyntax.xml`` file.

     :param filename: Passed to ``XML_Tool`` together with the name
         ``"ann_morphosyntax.xml"``.
     :param kwargs: Only ``tags`` is read; it defaults to ``None``.
     """
     self.tags = kwargs.pop("tags", None)
     self.tagspec = ".*/seg/fs"
     self.xml_tool = XML_Tool(filename, "ann_morphosyntax.xml")
     # The base class reads the file produced by the preprocessing step.
     preprocessed = self.xml_tool.build_preprocessed_file()
     XMLCorpusView.__init__(self, preprocessed, self.tagspec)
Esempio n. 28
0
 def __init__(self, filename, **kwargs):
     """
     Set up a view over the preprocessed ``ann_named.xml`` file.

     :param filename: Passed to ``XML_Tool`` together with the name
         ``'ann_named.xml'``.
     :param kwargs: Accepted but not used here.
     """
     self.tagspec = '.*/seg/fs'
     self.xml_tool = XML_Tool(filename, 'ann_named.xml')
     # The base class reads the file produced by the preprocessing step.
     preprocessed = self.xml_tool.build_preprocessed_file()
     XMLCorpusView.__init__(self, preprocessed, self.tagspec)
Esempio n. 29
0
    # Fragment of a larger routine: `matrix` and `filenames` are defined
    # before this point (not visible here).
    # A fresh TSNE is fitted on every element of `matrix`, and only the first
    # row of each transformed result is kept.
    # NOTE(review): `file` is only used by the commented-out debug print.
    doc_2d = []
    for doc, file in zip(matrix, filenames):                # reduce the data to 2 dimensions
        #print(file, "\n", doc, "\n\n")    # debug msg
        doc_2d.append(TSNE().fit_transform(doc).tolist()[0])

    matrix = np.asarray(doc_2d)                             # `matrix` now holds the reduced rows


    # Write the reduced matrix as tab-separated text.
    np.savetxt('lsa_reduced.csv', matrix, delimiter='\t')  # raw output


    # Build the list of tags from the metadata: one row per filename, with a
    # single 'Tags' column that starts out empty.
    metadata = pd.DataFrame(index=filenames, columns=['Tags'])

    # Every element matched by '.*/article' supplies one tag: the row label
    # is its xml:id attribute plus '.txt', the value is its `type` attribute.
    # NOTE(review): `iter` shadows the builtin of the same name in this scope.
    view = XMLCorpusView('txt/export-abstracts.xml', '.*/article')
    iter = view.iterate_from(0)
    for entry in iter:
        metadata.loc[entry.attrib['{http://www.w3.org/XML/1998/namespace}id']+'.txt', 'Tags'] = entry.attrib['type']

    metadata.to_csv('lsa_metadata.csv')


    ##############################################################################
    # CLUSTERING

    print("clustering ...\n")

    # Alternative, explicitly parameterised call kept for reference:
    #af = AffinityPropagation(damping=0.9, affinity="euclidean", preference=-50).fit(matrix)
    af = AffinityPropagation().fit(matrix)                  # default parameters
Esempio n. 30
0
 def __init__(self, fileid, tagspec, elt_handler=None):
     """
     Initialise the view by delegating directly to ``XMLCorpusView``.

     :param fileid: Forwarded unchanged to ``XMLCorpusView.__init__``.
     :param tagspec: Forwarded unchanged to ``XMLCorpusView.__init__``.
     :param elt_handler: Forwarded unchanged; defaults to ``None``.
     """
     base_args = (fileid, tagspec, elt_handler)
     XMLCorpusView.__init__(self, *base_args)
Esempio n. 31
0
 def __init__(self, fileid, tagspec, elt_handler=None):
     """
     Pass all three arguments straight through to the base class.

     :param fileid: Given as-is to ``XMLCorpusView.__init__``.
     :param tagspec: Given as-is to ``XMLCorpusView.__init__``.
     :param elt_handler: Given as-is; ``None`` when omitted.
     """
     forwarded = (fileid, tagspec, elt_handler)
     XMLCorpusView.__init__(self, *forwarded)
Esempio n. 32
0
    # Fragment of a larger routine: `matrix` and `filenames` are defined
    # before this point (not visible here).
    # Write the current matrix as tab-separated text.
    np.savetxt("lsa_model.csv", matrix, delimiter="\t")  # raw output

    # A fresh TSNE is fitted on every element of `matrix`, and only the first
    # row of each transformed result is kept.
    # NOTE(review): `file` is only used by the commented-out debug print.
    doc_2d = []
    for doc, file in zip(matrix, filenames):  # reduce the data to 2 dimensions
        # print(file, "\n", doc, "\n\n")    # debug msg
        doc_2d.append(TSNE().fit_transform(doc).tolist()[0])

    matrix = np.asarray(doc_2d)  # `matrix` now holds the reduced rows

    # Write the reduced matrix as tab-separated text.
    np.savetxt("lsa_reduced.csv", matrix, delimiter="\t")  # raw output

    # Build the list of tags from the metadata: one row per filename, with a
    # single "Tags" column that starts out empty.
    metadata = pd.DataFrame(index=filenames, columns=["Tags"])

    # Every element matched by ".*/article" supplies one tag: the row label
    # is its xml:id attribute plus ".txt", the value is its `type` attribute.
    # NOTE(review): `iter` shadows the builtin of the same name in this scope.
    view = XMLCorpusView("txt/export-abstracts.xml", ".*/article")
    iter = view.iterate_from(0)
    for entry in iter:
        metadata.loc[entry.attrib["{http://www.w3.org/XML/1998/namespace}id"] + ".txt", "Tags"] = entry.attrib["type"]

    metadata.to_csv("lsa_metadata.csv")

    ##############################################################################
    # CLUSTERING

    print("clustering ...\n")

    # Alternative, explicitly parameterised call kept for reference:
    # af = AffinityPropagation(damping=0.9, affinity="euclidean", preference=-50).fit(matrix)
    af = AffinityPropagation().fit(matrix)  # default parameters

    # Fitted attribute read from the estimator for use by later code.
    cluster_centers_indices = af.cluster_centers_indices_