def postag_api(version=1):
    """ API to parse text and return POS tagged tokens in a verbose JSON format """
    if not (1 <= version <= 1):
        # Unsupported version
        return better_jsonify(valid=False, reason="Unsupported version")

    try:
        text = text_from_request(request)
    except Exception:
        return better_jsonify(valid=False, reason="Invalid request")

    with SessionContext(commit=True) as session:
        pgs, stats, register = TreeUtility.tag_text(session, text, all_names=True)
        # Amalgamate the result into a single list of sentences
        pa: List[List[TokenDict]] = []
        if pgs:
            # If there is only one paragraph, use its sentences directly
            if len(pgs) == 1:
                pa = pgs[0]
            else:
                # More than one paragraph: gotta concatenate 'em all
                for pg in pgs:
                    pa.extend(pg)
        for sent in pa:
            # Transform the token representation into a
            # nice canonical form for outside consumption
            # err = any("err" in t for t in sent)
            for t in sent:
                canonicalize_token(t)

    # Return the tokens as a JSON structure to the client
    return better_jsonify(valid=True, result=pa, stats=stats, register=register)
def postag_api(version=1):
    """ API to parse text and return POS tagged tokens in a verbose JSON format """
    if not (1 <= version <= 1):
        # Unsupported version
        return better_jsonify(valid=False, reason="Unsupported version")

    try:
        text = text_from_request(request)
    except Exception:
        return better_jsonify(valid=False, reason="Invalid request")

    with SessionContext(commit=True) as session:
        pgs, stats, register = TreeUtility.tag_text(session, text, all_names=True)
        # Amalgamate the result into a single list of sentences
        if pgs:
            # If there is only one paragraph, use its sentences directly
            if len(pgs) == 1:
                pgs = pgs[0]
            else:
                # More than one paragraph: gotta concatenate 'em all
                pa = []
                for pg in pgs:
                    pa.extend(pg)
                pgs = pa
        for sent in pgs:
            # Transform the token representation into a
            # nice canonical form for outside consumption
            # err = any("err" in t for t in sent)
            for t in sent:
                canonicalize_token(t)

    # Return the tokens as a JSON structure to the client
    return better_jsonify(valid=True, result=pgs, stats=stats, register=register)
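# A minimal client-side sketch of how the postag_api endpoint above might be
# exercised. The route path ("/postag.api"), the "text" form parameter, and the
# postag_demo helper are assumptions for illustration only; adjust them to match
# the actual URL rule and whatever text_from_request() expects in your deployment.
import requests

def postag_demo(base_url: str, text: str) -> None:
    resp = requests.post(base_url + "/postag.api", data={"text": text})
    reply = resp.json()
    if not reply.get("valid"):
        print("Request failed:", reply.get("reason"))
        return
    # result is a list of sentences, each a list of canonicalized token dicts
    for sent in reply["result"]:
        for token in sent:
            print(token)
    print("Stats:", reply.get("stats"))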
def _visit_token(self, level, node):
    """ At token node """
    meaning = node.token.match_with_meaning(node.terminal)
    d, _ = TreeUtility._describe_token(
        self._tokens[node.token.index],
        node.terminal,
        None if isinstance(meaning, bool) else meaning,
    )
    # Convert from compact form to external (more verbose and descriptive) form
    canonicalize_token(d)
    self._builder.push_terminal(d)
    return None
def tag_stream(sentence_stream):
    """ Generator for tag stream from a token stream """
    for sent in sentence_stream:
        if not sent:
            continue
        # For each sentence, start and end with empty strings
        for i in range(n - 1):
            yield ""
        for t in sent:
            tag = None
            # Skip punctuation
            if t.get("k", TOK.WORD) != TOK.PUNCTUATION:
                canonicalize_token(t)
                tag = str(IFD_Tagset(t))
                if tag:
                    self.lemma_cnt[t["x"]][tag] += 1
            if tag:
                yield tag
        for i in range(n - 1):
            yield ""
def tag_stream(sentence_stream: Iterable[Iterable[TokenDict]]) -> Iterator[str]:
    """ Generator for tag stream from a token stream """
    for sent in sentence_stream:
        if not sent:
            continue
        # For each sentence, start and end with empty strings
        for _ in range(n - 1):
            yield ""
        for t in sent:
            tag = None
            # Skip punctuation
            if t.get("k", TOK.WORD) != TOK.PUNCTUATION:
                ct = canonicalize_token(t)
                tag = str(IFD_Tagset(ct))
                if tag:
                    self.lemma_cnt[ct.get("x", "")][tag] += 1
            if tag:
                yield tag
        for _ in range(n - 1):
            yield ""
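# A small, self-contained sketch of how a tag stream like the one produced above
# could be consumed, assuming the (n - 1) empty-string padding exists to frame
# each sentence for n-gram counting (n = 3 here). The count_ngrams helper and
# the toy IFD-style tags below are illustrative only, not part of the original module.
from collections import Counter
from typing import Iterable, Tuple

def count_ngrams(tags: Iterable[str], n: int = 3) -> Counter:
    """ Count n-grams over a flat stream of tags """
    counts: Counter = Counter()
    window: Tuple[str, ...] = ()
    for tag in tags:
        # Slide an n-sized window over the stream
        window = (window + (tag,))[-n:]
        if len(window) == n:
            counts[window] += 1
    return counts

# Example: a pre-tagged stream with (n - 1) empty strings framing one sentence
demo_stream = iter(["", "", "nken", "sfg3en", "lkensf", "", ""])
print(count_ngrams(demo_stream, n=3).most_common(5))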