Example #1
def postag_api(version=1):
    """ API to parse text and return POS tagged tokens in a verbose JSON format """
    if not (1 <= version <= 1):
        # Unsupported version
        return better_jsonify(valid=False, reason="Unsupported version")

    try:
        text = text_from_request(request)
    except Exception:
        return better_jsonify(valid=False, reason="Invalid request")

    with SessionContext(commit=True) as session:
        pgs, stats, register = TreeUtility.tag_text(session, text, all_names=True)
        # Amalgamate the result into a single list of sentences
        pa: List[List[TokenDict]] = []
        if pgs:
            # Only process the first paragraph, if there are many of them
            if len(pgs) == 1:
                pa = pgs[0]
            else:
                # More than one paragraph: gotta concatenate 'em all
                for pg in pgs:
                    pa.extend(pg)
        for sent in pa:
            # Transform the token representation into a
            # nice canonical form for outside consumption
            # err = any("err" in t for t in sent)
            for t in sent:
                canonicalize_token(t)

    # Return the tokens as a JSON structure to the client
    return better_jsonify(valid=True, result=pa, stats=stats, register=register)
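
The paragraph handling above (use the single paragraph as-is, otherwise concatenate them all) can also be written as one flattening step. A minimal, self-contained sketch with made-up data, not taken from the original code:

# Hypothetical illustration of the paragraph flattening done in Example #1:
# a list of paragraphs (each a list of sentences) collapses into a flat
# list of sentences.
pgs = [[["tok1", "tok2"], ["tok3"]], [["tok4"]]]  # two paragraphs
pa = [sent for pg in pgs for sent in pg] if pgs else []
assert pa == [["tok1", "tok2"], ["tok3"], ["tok4"]]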
Example #2
def postag_api(version=1):
    """ API to parse text and return POS tagged tokens in a verbose JSON format """
    if not (1 <= version <= 1):
        # Unsupported version
        return better_jsonify(valid=False, reason="Unsupported version")

    try:
        text = text_from_request(request)
    except Exception:
        return better_jsonify(valid=False, reason="Invalid request")

    with SessionContext(commit=True) as session:
        pgs, stats, register = TreeUtility.tag_text(session, text, all_names=True)
        # Amalgamate the result into a single list of sentences
        if pgs:
            # Only process the first paragraph, if there are many of them
            if len(pgs) == 1:
                pgs = pgs[0]
            else:
                # More than one paragraph: gotta concatenate 'em all
                pa = []
                for pg in pgs:
                    pa.extend(pg)
                pgs = pa
        for sent in pgs:
            # Transform the token representation into a
            # nice canonical form for outside consumption
            # err = any("err" in t for t in sent)
            for t in sent:
                canonicalize_token(t)

    # Return the tokens as a JSON structure to the client
    return better_jsonify(valid=True, result=pgs, stats=stats, register=register)
Example #3
def _visit_token(self, level, node):
    """ At token node """
    meaning = node.token.match_with_meaning(node.terminal)
    d, _ = TreeUtility._describe_token(
        self._tokens[node.token.index],
        node.terminal,
        None if isinstance(meaning, bool) else meaning,
    )
    # Convert from compact form to external (more verbose and descriptive) form
    canonicalize_token(d)
    self._builder.push_terminal(d)
    return None
Example #4
def tag_stream(sentence_stream):
    """ Generator for tag stream from a token stream """
    for sent in sentence_stream:
        if not sent:
            continue
        # For each sentence, start and end with empty strings
        for i in range(n - 1):
            yield ""
        for t in sent:
            tag = None
            # Skip punctuation
            if t.get("k", TOK.WORD) != TOK.PUNCTUATION:
                canonicalize_token(t)
                tag = str(IFD_Tagset(t))
                if tag:
                    self.lemma_cnt[t["x"]][tag] += 1
            if tag:
                yield tag
        for i in range(n - 1):
            yield ""
Example #5
def tag_stream(sentence_stream: Iterable[Iterable[TokenDict]]) -> Iterator[str]:
    """ Generator for tag stream from a token stream """
    for sent in sentence_stream:
        if not sent:
            continue
        # For each sentence, start and end with empty strings
        for _ in range(n - 1):
            yield ""
        for t in sent:
            tag = None
            # Skip punctuation
            if t.get("k", TOK.WORD) != TOK.PUNCTUATION:
                ct = canonicalize_token(t)
                tag = str(IFD_Tagset(ct))
                if tag:
                    self.lemma_cnt[ct.get("x", "")][tag] += 1
            if tag:
                yield tag
        for _ in range(n - 1):
            yield ""