def __init__(self):
    """Initialize empty node containers.

    The previous docstring documented ``jTree``/``sfp`` parameters that this
    constructor does not take (they belong to ``iterativeParse_jTree``), so it
    was removed as misleading.
    """
    self._aNodes = slist()  # parsed syntactic tree nodes
    self._sent_dep_wNodes = slist()  # word nodes carrying sentence dependencies
def test_StreamFileReading(self):
    """Round-trip an slist of slists through pickle via an in-memory stream."""
    sio = BytesIO()
    expected = slist(slist((i,)) for i in xrange(10))
    expected.dumpToPickle(sio)
    # "Rewind" by constructing a fresh reader over the written bytes.
    sio = BytesIO(sio.getvalue())
    result = stream.loadFromPickled(sio)
    # assertEqual: assertEquals is a deprecated alias (removed in Python 3.12).
    self.assertEqual(list(expected), list(result))
def __init__(self, syntWord, aDependencies):
    """Partition *aDependencies* by this word's role and index them by type.

    :type syntWord: nlplib.spinlib.SyntWordNode
    :type aDependencies: list[GrammDep]
    """
    assert isinstance(syntWord, SyntWordNode)
    assert isinstance(aDependencies, list)
    pos = syntWord.position
    # Dependencies where this word is the governor / the slave, respectively.
    self._aGovDeps = slist([d for d in aDependencies if d.gov.position == pos])
    self._aSlaveDeps = slist([d for d in aDependencies if d.slave.position == pos])
    # For each role, map dependency type -> list of counterpart word nodes.
    self._hDeps = {'gov': defaultdict(list), 'slave': defaultdict(list)}
    for dtype, _me, counterpart in self._aGovDeps:
        self._hDeps['gov'][dtype].append(counterpart)
    for dtype, counterpart, _me in self._aSlaveDeps:
        self._hDeps['slave'][dtype].append(counterpart)
def segment_text(text):
    """
    Segment raw text into sentences using Stanford DocumentProcessor.

    :type text: str|unicode
    :rtype: slist[basestring]
    """
    text = text.strip()
    reader = _JClasses.JStringReader(_JClasses.JString(text))
    dp = _JClasses.JDocumentProcessor(reader)
    iterator = dp.iterator()
    sentences = slist()
    while iterator.hasNext():
        sentence_array = next(iterator)
        tokens = []
        for idx in range(sentence_array.size()):
            token = sentence_array[idx].toString()
            # str.replace substitutes every occurrence in a single call, so the
            # old fixed-point loop was redundant, and its '\\/' -> '/' pass was
            # unreachable (no '\\/' can remain once all backslashes are gone).
            # Stripping backslashes yields the identical result.
            tokens.append(token.replace("\\", ""))
        sentences.append(' '.join(tokens))
    return sentences
def segment_text(text):
    """
    Segment raw text into sentences using Stanford DocumentProcessor.

    :type text: str|unicode
    :rtype: slist[basestring]
    """
    text = text.strip()
    reader = _JClasses.JStringReader(_JClasses.JString(text))
    dp = _JClasses.JDocumentProcessor(reader)
    iterator = dp.iterator()
    sentences = slist()
    while iterator.hasNext():
        # next(iterator) instead of Python-2-only iterator.next(); this also
        # matches the sibling implementation of this function.
        sentence_array = next(iterator)
        tokens = []
        for idx in range(sentence_array.size()):
            token = sentence_array[idx].toString()
            # str.replace substitutes every occurrence in a single call, so the
            # old fixed-point loop was redundant, and its '\\/' -> '/' pass was
            # unreachable (no '\\/' can remain once all backslashes are gone).
            # Stripping backslashes yields the identical result.
            tokens.append(token.replace("\\", ""))
        sentences.append(' '.join(tokens))
    return sentences
def tokenizeText(self, text: str) -> slist:
    """Split *text* into sentences, then TreeBank-tokenize each sentence.

    Newlines inside a sentence are collapsed to spaces before tokenizing.
    Quote normalization is handled by the sentence tokenizer.
    """
    result = slist()
    for sentence in self.__tokenizeToSentences(text):
        flattened = creReplaceNLs.sub(r' ', sentence)
        # TreeBank tokenizer initialized in __init__
        result.extend(self._tb_tokenizer(flattened))
    return result
def getEqDeps(self, depType, bGov=True):
    """Collect counterpart nodes for every name equivalent to *depType*.

    :type depType: DepType
    :type bGov: bool
    :rtype : slist[ SyntWordNode ] | None
    """
    hDeps = self._hDeps['gov' if bGov else 'slave']
    found = slist()
    for name in depType.equivalent_names:
        found.extend(hDeps.get(name, ()))
    # Callers distinguish "no matches" via None rather than an empty list.
    return found if found else None
def tokenizeText(self, text):
    """Sentence-split *text*, then tokenize each sentence with the TreeBank
    tokenizer (set up in __init__). Embedded newlines become single spaces.

    :param text:
    :type text: str | unicode
    :rtype: slist
    """
    out = slist()
    for sentence in self.__tokenizeToSentences(text):
        cleaned = creReplaceNLs.sub(r' ', sentence)
        out.extend(self._tb_tokenizer(cleaned))
    return out
def getTaggedText(self) -> slist[Union[SyntToken, SyntWordToken]]:
    """Convert the stored tokens into SyntToken/SyntWordToken objects.

    Word-like tags become SyntWordToken (carrying the token's lemma); all
    other tokens become plain SyntToken with the tag normalized: SYM for
    punctuation/symbols, the original tag for CD/SYM/LS/UH, OTHER_TAG
    otherwise. Token attributes (tag_, is_punct, pos_, lemma_) follow the
    spaCy token API — presumably self._tokens is a spaCy Doc/list; confirm.
    """
    aText = slist()
    for i, token in enumerate(self._tokens):
        tag = CTags.fromString(token.tag_)
        if CTags.isWordType(tag):
            sn = SyntWordToken(text=token.text, tag=tag, stemmed=token.lemma_, idx=i)
        else:
            # NOTE: the punctuation check runs BEFORE the keep-as-is list,
            # so a punctuation-flagged token always maps to SYM.
            if token.is_punct or token.pos_ in ("SYM", "PUNCT"):
                tag = CTags.SYM
            elif tag in (CTags.CD, CTags.SYM, CTags.LS, CTags.UH):
                pass  # keep these tags unchanged
            else:
                tag = CTags.fromString("OTHER_TAG")
            sn = SyntToken(text=token.text, tag=tag, idx=i)
        aText.append(sn)
    return aText
def tokenizeText(self, text: str) -> List[SpacySentence]:
    """Normalize quotes and stray high bytes in *text*, then sentence-split
    and tokenize it with the Spacy pipeline.
    """
    # Quote/byte normalization: map single-quote variants, double-quote
    # variants, drop remaining non-ASCII bytes, promote 'quoted' -> "quoted",
    # and collapse whitespace runs.
    text = re.sub(r'[`\x92\x91]', r"'", text)
    text = re.sub(r'[\x93\x94\x95\x96\x85\xE9]', r'"', text)
    text = re.sub(r'[\x80-\xFF]', r' ', text)
    text = re.sub(r"([:\s])\'(.+?)\'([\s\.])", r'\1"\2"\3', text)
    text = re.sub(r"\s+", r' ', text)
    text = text.strip()
    doc: Doc = self._model(text)
    result = slist()
    sent: Span
    for sent in doc.sents:
        result.append(SpacySentence(list(sent), sent.text))
    return result
def test_reduceUsesInitProperly(self):
    """reduce() must work both without and with an explicit initial value."""
    # assertEqual: assertEquals is a deprecated alias (removed in Python 3.12).
    self.assertEqual(
        slist([sset((1, 2)), sset((3, 4))]).reduce(lambda x, y: x.update(y)),
        set((1, 2, 3, 4)))
    self.assertEqual(
        slist([sset((1, 2)), sset((3, 4))]).reduce(lambda x, y: x.update(y), sset()),
        set((1, 2, 3, 4)))
def iterativeParse_jTree(self, jTree, sfp):
    """
    Parses the Tree object from JVM into Python VM
    Returns tuple (tree, dictionary of WordPositions)
    :type sfp: StanfordParser
    :type jTree: jpype._jclass.edu.stanford.nlp.trees.LabeledScoredTreeNode
    :rtype : PhraseNode, slist[SyntWordNode]
    """
    level = 0
    tree_index = 0
    # Root phrase node built from the JVM tree's root label.
    ptree = self._constructPhraseNode(CTags.fromString(str(jTree.value())), level, jTree, self, tree_index)
    """:type : CPhraseNode"""
    level += 1  # only root is level 0
    tree_index += 1
    # Explicit stack (popped from the tail); reversed so children are
    # visited left-to-right, giving depth-first preorder traversal.
    q = []
    for c in jTree.children():
        q.append((c, ptree, level))
    q = list(reversed(q))
    hWordsPositions = {}
    while len(q):
        node = q.pop()
        wh_add = node[1]  # parent Python node to attach the new node to
        level = node[2]
        node = node[0]
        if node.isLeaf():
            raise ValueError("Malformed syntactic tree: unexpected leaf")
        elif node.isPreTerminal():
            # Pre-terminal: a POS tag whose single child is the word itself.
            tag = str(node.value())
            w_label = node.children()[0].label()
            v = w_label.value()
            # Replace non-ASCII bytes before handing the word to the stemmer.
            word = re.sub(r'[\x80-\xFF]', r"_Unknown_char_", v)
            wtag = sfp.stem(word, tag)
            _wrd = wtag.word()
            if _wrd is None:
                raise Exception(
                    "wtag returned by stemmer returns NULL word() for word %s and tag %s: %s %s" % (word, tag, str(wtag), str(wtag.__class__)))
            # Manual overrides for stems the stemmer gets wrong.
            if wtag.word().lower() in STEM_EXCEPTIONS:
                stemmed = STEM_EXCEPTIONS[wtag.word().lower()]
            else:
                stemmed = wtag.word().lower()
            w_pos = (int(w_label.beginPosition()), int(w_label.endPosition()))
            nt = self._constructWordNode(word, CTags.fromString(tag), stemmed, level, w_pos, node, self, tree_index, wh_add)
            tree_index += 1
            # Keyed by character offset so words can be re-sorted into
            # sentence order after the traversal.
            hWordsPositions[int(w_label.beginPosition())] = nt
            wh_add.addChild(nt)
        else:
            nt = self._constructPhraseNode(
                CTags.fromString(str(node.value())), level, node, self, tree_index, wh_add)
            tree_index += 1
            wh_add.addChild(nt)
            level += 1
            chldren = []
            for c in node.children():
                chldren.append((c, nt, level))
            q.extend(reversed(chldren))
    # Sort by character offset, then assign sequential word positions.
    wNodes = slist(hWordsPositions[p] for p in sorted(hWordsPositions.keys()))
    """:type : list[SyntWordNode]"""
    for i in range(len(wNodes)):
        wNodes[i].set_position(i)
    return ptree, wNodes
def _getNextBuffer(self):
    """Decrement the counter and return range(counter) as an slist; an empty
    slist signals exhaustion once the counter reaches zero or below.
    """
    self._counter -= 1
    if self._counter <= 0:
        return slist()
    return slist(range(self._counter))
def testStreamList(self):
    """slist exposes its contents via toList() and supports negative indexing."""
    make = lambda: slist((1, 2, 3))
    self.assertEqual(make().toList(), [1, 2, 3])
    self.assertEqual(make()[-1], 3)
def test_nominal(self):
    """A stream of growing ranges flattens in order."""
    # range() instead of Python-2-only xrange(): slist() consumes either
    # iterable identically, and this keeps the test portable to Python 3.
    s = SynchronizedBufferedStream((slist(range(i)) for i in range(1, 4)))
    self.assertListEqual(s.toList(), [0, 0, 1, 0, 1, 2])
def test_slist_repr_nominal(self):
    """repr() of an slist matches repr() of an equivalent plain list."""
    l = [1, 2, 3]
    s = slist(l)
    # assertEqual: assertEquals is a deprecated alias (removed in Python 3.12).
    self.assertEqual(repr(s), repr(l))
def test_slist_str_nominal(self):
    """str() of an slist matches str() of an equivalent plain list."""
    l = [1, 2, 3]
    s = slist(l)
    # Removed unused local `s1 = str(s)` (computed but never referenced);
    # assertEqual: assertEquals is a deprecated alias (removed in Python 3.12).
    self.assertEqual(str(s), str(l))
def test_reversedNominal(self):
    """reversed() yields a new slist with the elements in opposite order."""
    original = slist([1, 2, 3])
    flipped = original.reversed()
    self.assertListEqual(flipped.toList(), [3, 2, 1])
def iterativeParse_jTree(self, jTree, sfp):
    """
    Parses the Tree object from JVM into Python VM
    Returns tuple (tree, dictionary of WordPositions)
    :type sfp: StanfordParser
    :type jTree: jpype._jclass.edu.stanford.nlp.trees.LabeledScoredTreeNode
    :rtype : PhraseNode, slist[SyntWordNode]
    """
    level = 0
    tree_index = 0
    # Root phrase node built from the JVM tree's root label.
    ptree = self._constructPhraseNode(CTags.fromString(str(jTree.value())), level, jTree, self, tree_index)
    """:type : CPhraseNode"""
    level += 1  # only root is level 0
    tree_index += 1
    # Explicit stack (popped from the tail); reversed so children are
    # visited left-to-right, giving depth-first preorder traversal.
    q = []
    for c in jTree.children():
        q.append((c, ptree, level))
    q = list(reversed(q))
    hWordsPositions = {}
    while len(q):
        node = q.pop()
        wh_add = node[1]  # parent Python node to attach the new node to
        level = node[2]
        node = node[0]
        if node.isLeaf():
            raise ValueError("Malformed syntactic tree: unexpected leaf")
        elif node.isPreTerminal():
            # Pre-terminal: a POS tag whose single child is the word itself.
            tag = str(node.value())
            w_label = node.children()[0].label()
            v = w_label.value()
            # Replace non-ASCII bytes before handing the word to the stemmer.
            word = re.sub(r'[\x80-\xFF]', r"_Unknown_char_", v)
            wtag = sfp.stem(word, tag)
            _wrd = wtag.word()
            if _wrd is None:
                raise Exception("wtag returned by stemmer returns NULL word() for word %s and tag %s: %s %s" % (
                    word, tag, str(wtag), str(wtag.__class__)))
            # Manual overrides for stems the stemmer gets wrong.
            if wtag.word().lower() in STEM_EXCEPTIONS:
                stemmed = STEM_EXCEPTIONS[wtag.word().lower()]
            else:
                stemmed = wtag.word().lower()
            w_pos = (int(w_label.beginPosition()), int(w_label.endPosition()))
            nt = self._constructWordNode(word, CTags.fromString(tag), stemmed, level, w_pos, node, self, tree_index, wh_add)
            tree_index += 1
            # Keyed by character offset so words can be re-sorted into
            # sentence order after the traversal.
            hWordsPositions[int(w_label.beginPosition())] = nt
            wh_add.addChild(nt)
        else:
            nt = self._constructPhraseNode(CTags.fromString(str(node.value())), level, node, self, tree_index, wh_add)
            tree_index += 1
            wh_add.addChild(nt)
            level += 1
            chldren = []
            for c in node.children():
                chldren.append((c, nt, level))
            q.extend(reversed(chldren))
    # Sort by character offset, then assign sequential word positions.
    wNodes = slist(hWordsPositions[p] for p in sorted(hWordsPositions.keys()))
    """:type : list[SyntWordNode]"""
    for i in xrange(len(wNodes)):
        wNodes[i].set_position(i)
    return ptree, wNodes
def test_flatMap_defaultIdentityFunction(self):
    """flatMap() with no mapper flattens dicts into their keys."""
    l = slist(({1: 2, 3: 4}, {5: 6, 7: 8}))
    # assertEqual: assertEquals is a deprecated alias (removed in Python 3.12).
    self.assertEqual(l.flatMap().toSet(), set((1, 3, 5, 7)))