def __init__(self, fileid, sent, tag, strip_space, stem):
    """
    :param fileid: The name of the underlying file.
    :param sent: If true, include sentence bracketing.
    :param tag: The name of the tagset to use, or None for no tags.
    :param strip_space: If true, strip spaces from word tokens.
    :param stem: If true, then substitute stems for words.
    """
    # Sentence-level elements, or word/character elements inside sentences.
    tagspec = '.*/s' if sent else '.*/s/(.*/)?(c|w)'
    self._sent, self._tag = sent, tag
    self._strip_space, self._stem = strip_space, stem
    # Document metadata, filled in by handle_header below:
    # title, author, editor, and statement of responsibility.
    self.title = None
    self.author = None
    self.editor = None
    self.resps = None
    XMLCorpusView.__init__(self, fileid, tagspec)
    # Pull the TEI header out of the file up front.
    self._open()
    self.read_block(self._stream, '.*/teiHeader$', self.handle_header)
    self.close()
    # Start with a fresh tag context.
    self._tag_context = {0: ()}
def __init__(self, filename, **kwargs):
    """Stream-backed view over the ann_morphosyntax.xml annotation layer."""
    self.tags = kwargs.pop('tags', None)
    self.tagspec = '.*/seg/fs'
    # Preprocess the raw XML before handing it to the base view.
    self.xml_tool = XML_Tool(filename, 'ann_morphosyntax.xml')
    preprocessed = self.xml_tool.build_preprocessed_file()
    XMLCorpusView.__init__(self, preprocessed, self.tagspec)
def __init__(self, fileid, sent, tag, strip_space, stem):
    """
    :param fileid: The name of the underlying file.
    :param sent: If true, include sentence bracketing.
    :param tag: The name of the tagset to use, or None for no tags.
    :param strip_space: If true, strip spaces from word tokens.
    :param stem: If true, then substitute stems for words.
    """
    # Select sentence elements, or the word/char elements within them.
    tagspec = ".*/s" if sent else ".*/s/(.*/)?(c|w)"
    self._sent = sent
    self._tag = tag
    self._strip_space = strip_space
    self._stem = stem
    # Header metadata (populated by handle_header): title, author,
    # editor, and statement of responsibility.
    self.title = self.author = self.editor = self.resps = None
    XMLCorpusView.__init__(self, fileid, tagspec)
    # Consume the TEI header immediately so metadata is available.
    self._open()
    self.read_block(self._stream, ".*/teiHeader$", self.handle_header)
    self.close()
    # Reset the tag context.
    self._tag_context = {0: ()}
def __init__(self, filename, **kwargs):
    """Stream-backed view over a text.xml file; mode selects the output form."""
    self.mode = kwargs.pop('mode', 0)
    self.tagspec = '.*/div/ab'
    self.segm_dict = {}
    # XML preprocessing before the base class opens the stream.
    self.xml_tool = XML_Tool(filename, 'text.xml')
    XMLCorpusView.__init__(
        self, self.xml_tool.build_preprocessed_file(), self.tagspec
    )
def __init__(self, filename, **kwargs):
    """
    HEADER_MODE
    A stream backed corpus view specialized for use with header.xml
    files in NKJP corpus.
    """
    self.tagspec = ".*/sourceDesc$"
    header_file = filename + 'header.xml'
    XMLCorpusView.__init__(self, header_file, self.tagspec)
def __init__(self, filename, **kwargs):
    """Segmentation view built on top of a sentence-mode text view."""
    self.tagspec = '.*p/.*s'
    # Intersperse an NKJPCorpus_Text_View in sentence mode and run it
    # so its results are available to this view.
    self.text_view = NKJPCorpus_Text_View(
        filename, mode=NKJPCorpus_Text_View.SENTS_MODE
    )
    self.text_view.handle_query()
    # XML preprocessing, then base class initialization.
    self.xml_tool = XML_Tool(filename, 'ann_segmentation.xml')
    XMLCorpusView.__init__(
        self, self.xml_tool.build_preprocessed_file(), self.tagspec
    )
def read_block(self, stream, tagspec=None, elt_handler=None):
    """Read one block via the base class, dropping any None entries."""
    block = XMLCorpusView.read_block(self, stream, tagspec, elt_handler)
    return [elt for elt in block if elt is not None]
def tagged_words(self, lemmatize=True):
    """Return (word, pos[, lemma]) tuples for every ``w`` element.

    :param lemmatize: If true, append the lemma to each tuple.
    """
    words = XMLCorpusView(self.path, '.*/w')
    if lemmatize:
        return [
            (w.text, w.attrib['pos'], self.get_lemma(w)) for w in words
        ]
    return [(w.text, w.attrib['pos']) for w in words]
def sentences(self):
    """Returns a list of sentences where each sentence is a list of words"""
    sents = XMLCorpusView(self.path, '.*/sentence')
    return [[word.text for word in sentence] for sentence in sents]
def __init__(self, fileid, unit, bracket_sent, pos_tag, sem_tag):
    """
    :param fileid: The name of the underlying file.
    :param unit: One of `'token'`, `'word'`, or `'chunk'`.
    :param bracket_sent: If true, include sentence bracketing.
    :param pos_tag: Whether to include part-of-speech tags.
    :param sem_tag: Whether to include semantic tags, namely WordNet lemma
        and OOV named entity status.
    """
    # Whole sentences, or punctuation/word-form elements inside them.
    tagspec = '.*/s' if bracket_sent else '.*/s/(punc|wf)'
    self._unit = unit
    self._sent = bracket_sent
    self._pos_tag = pos_tag
    self._sem_tag = sem_tag
    XMLCorpusView.__init__(self, fileid, tagspec)
def handle_query(self):
    """Read every block from the stream and return the combined header."""
    self._open()
    header = []
    # iter() with an empty-list sentinel stops as soon as read_block
    # yields no more segments.
    for segm in iter(lambda: XMLCorpusView.read_block(self, self._stream), []):
        header.extend(segm)
    self.close()
    return header
def __init__(self, fileid, unit, bracket_sent, pos_tag, sem_tag, wordnet):
    """
    :param fileid: The name of the underlying file.
    :param unit: One of `'token'`, `'word'`, or `'chunk'`.
    :param bracket_sent: If true, include sentence bracketing.
    :param pos_tag: Whether to include part-of-speech tags.
    :param sem_tag: Whether to include semantic tags, namely WordNet lemma
        and OOV named entity status.
    """
    # Whole sentences, or punctuation/word-form elements inside them.
    tagspec = '.*/s' if bracket_sent else '.*/s/(punc|wf)'
    self._unit = unit
    self._sent = bracket_sent
    self._pos_tag = pos_tag
    self._sem_tag = sem_tag
    self._wordnet = wordnet
    XMLCorpusView.__init__(self, fileid, tagspec)
def tagged_sentences(self, lemmatize=True):
    """Return sentences as lists of (word, pos[, lemma]) tuples.

    :param lemmatize: If true, append the lemma to each tuple.
    """
    sents = XMLCorpusView(self.path, '.*/sentence')
    result = []
    for sent in sents:
        if lemmatize:
            tagged = [
                (w.text, w.attrib['pos'], self.get_lemma(w)) for w in sent
            ]
        else:
            tagged = [(w.text, w.attrib['pos']) for w in sent]
        result.append(tagged)
    return result
def _detect_encoding(self, fileid):
    """
    Sniff the character encoding declared in the XML declaration on the
    first line of *fileid*; fall back to the base class heuristic when
    no declaration is found.

    :param fileid: A filename string or a ``PathPointer``.
    :return: The declared encoding name as a string.
    """
    # Read the first line as raw bytes -- we cannot decode the file
    # before we know its encoding.  Close the handle in all cases
    # (the original leaked it).
    if isinstance(fileid, PathPointer):
        stream = fileid.open()
        try:
            first_line = stream.readline()
        finally:
            stream.close()
    else:
        with open(fileid, 'rb') as stream:
            first_line = stream.readline()
    # The original searched str patterns against bytes, which raises
    # TypeError on Python 3.  The XML declaration itself is ASCII, so
    # decode permissively before matching.
    if isinstance(first_line, bytes):
        first_line = first_line.decode('ascii', 'replace')
    m = re.search(r'encoding="([^"]+)"', first_line)
    if m:
        return m.group(1)
    m = re.search(r"encoding='([^']+)'", first_line)
    if m:
        return m.group(1)
    return XMLCorpusView._detect_encoding(self, fileid)
def read_block(self, stream, tagspec=None, elt_handler=None):
    """
    Returns text as a list of sentences.

    All segments are drained from the stream and joined by single
    spaces into one string, returned as a one-element list.
    """
    txt = []
    while True:
        segm = XMLCorpusView.read_block(self, stream)
        if not segm:
            break
        txt.extend(segm)
    # The original wrapped txt in a redundant identity comprehension
    # ([segm for segm in txt]) that also shadowed the loop variable;
    # joining the list directly is equivalent.
    return [' '.join(txt)]
def handle_query(self):
    """
    Drain all word segments from the stream and return them as a flat
    list, skipping None placeholders.

    The preprocessed temporary file is removed whether reading succeeds
    or fails.  On failure the original exception propagates unchanged
    (the previous ``raise Exception`` discarded the real error and its
    traceback).
    """
    try:
        self._open()
        words = []
        while True:
            segm = XMLCorpusView.read_block(self, self._stream)
            if not segm:
                break
            # read_block may yield None placeholders; skip them.
            words.extend(part for part in segm if part is not None)
        self.close()
        return words
    finally:
        # Always clean up the preprocessed temp file.
        self.xml_tool.remove_preprocessed_file()
def handle_query(self):
    """
    Drain all sentence segments from the stream, resolve <choice>
    alternatives via ``remove_choice``, and return the extracted
    sentences.

    The preprocessed temporary file is removed whether reading succeeds
    or fails.  On failure the original exception propagates unchanged
    (the previous ``raise Exception`` discarded the real error and its
    traceback).
    """
    try:
        self._open()
        sentences = []
        while True:
            sent_segm = XMLCorpusView.read_block(self, self._stream)
            if not sent_segm:
                break
            for segm in sent_segm:
                segm = self.remove_choice(segm)
                sentences.append(self.get_sentences(segm))
        self.close()
        return sentences
    finally:
        # Always clean up the preprocessed temp file.
        self.xml_tool.remove_preprocessed_file()
def __init__(self, filename, **kwargs):
    """Stream-backed view over the ann_morphosyntax.xml annotation layer."""
    self.tags = kwargs.pop("tags", None)
    self.tagspec = ".*/seg/fs"
    # Preprocess the XML, then initialize the base view on the result.
    self.xml_tool = XML_Tool(filename, "ann_morphosyntax.xml")
    XMLCorpusView.__init__(
        self, self.xml_tool.build_preprocessed_file(), self.tagspec
    )
def __init__(self, filename, **kwargs):
    """Stream-backed view over the ann_named.xml named-entity layer."""
    self.tagspec = '.*/seg/fs'
    # Preprocess the XML before the base class opens the stream.
    self.xml_tool = XML_Tool(filename, 'ann_named.xml')
    preprocessed = self.xml_tool.build_preprocessed_file()
    XMLCorpusView.__init__(self, preprocessed, self.tagspec)
# Reduce each document vector to 2 dimensions with t-SNE.
doc_2d = []
for doc, fname in zip(matrix, filenames):
    doc_2d.append(TSNE().fit_transform(doc).tolist()[0])
matrix = np.asarray(doc_2d)  # update matrix array

# Raw output of the reduced matrix.
np.savetxt('lsa_reduced.csv', matrix, delimiter='\t')

# Build list of tags from the metadata.
metadata = pd.DataFrame(index=filenames, columns=['Tags'])
view = XMLCorpusView('txt/export-abstracts.xml', '.*/article')
# Iterate the view directly; the original bound it to a variable named
# ``iter``, shadowing the builtin.
for entry in view.iterate_from(0):
    xml_id = entry.attrib['{http://www.w3.org/XML/1998/namespace}id']
    metadata.loc[xml_id + '.txt', 'Tags'] = entry.attrib['type']
metadata.to_csv('lsa_metadata.csv')

##############################################################################
# CLUSTERING
print("clustering ...\n")

# Default parameters; a tuned alternative would be e.g.
# AffinityPropagation(damping=0.9, affinity="euclidean", preference=-50).
af = AffinityPropagation().fit(matrix)  # default
def __init__(self, fileid, tagspec, elt_handler=None):
    """
    Initialize the view by delegating directly to ``XMLCorpusView``.

    :param fileid: The name of the underlying file.
    :param tagspec: Tag specification selecting the elements to read.
    :param elt_handler: Optional element handler passed through to the
        base class.
    """
    XMLCorpusView.__init__(self, fileid, tagspec, elt_handler)
np.savetxt("lsa_model.csv", matrix, delimiter="\t")  # raw output

# Reduce each document vector to 2 dimensions with t-SNE.
doc_2d = []
for doc, fname in zip(matrix, filenames):
    doc_2d.append(TSNE().fit_transform(doc).tolist()[0])
matrix = np.asarray(doc_2d)  # update matrix array

# Raw output of the reduced matrix.
np.savetxt("lsa_reduced.csv", matrix, delimiter="\t")

# Build list of tags from the metadata.
metadata = pd.DataFrame(index=filenames, columns=["Tags"])
view = XMLCorpusView("txt/export-abstracts.xml", ".*/article")
# Iterate the view directly; the original bound it to a variable named
# ``iter``, shadowing the builtin.
for entry in view.iterate_from(0):
    xml_id = entry.attrib["{http://www.w3.org/XML/1998/namespace}id"]
    metadata.loc[xml_id + ".txt", "Tags"] = entry.attrib["type"]
metadata.to_csv("lsa_metadata.csv")

##############################################################################
# CLUSTERING
print("clustering ...\n")

# Default parameters; a tuned alternative would be e.g.
# AffinityPropagation(damping=0.9, affinity="euclidean", preference=-50).
af = AffinityPropagation().fit(matrix)  # default
cluster_centers_indices = af.cluster_centers_indices_