def _process_text(parser, session, text, all_names, xform):
    """ Low-level utility function to parse text and return the result of a
        transformation function (xform) for each sentence.
        Set all_names = True to get a comprehensive name register.
        Set all_names = False to get a simple name register.
        Set all_names = None to get no name register.
        Returns a (paragraphs, stats, register) tuple, where stats contains
        tok_time, parse_time and total_time entries in seconds. """
    t0 = time.time()
    # Demarcate paragraphs in the input
    text = Fetcher.mark_paragraphs(text)
    # Tokenize the result
    toklist = list(tokenize_and_recognize(text, enclosing_session=session))
    t1 = time.time()
    pgs, stats = TreeUtility._process_toklist(parser, session, toklist, xform)
    # Capture the parse end time here, before building the name register:
    # previously t2 was taken after register construction, which made
    # parse_time include the register build time as well
    t2 = time.time()
    if all_names is None:
        register = None
    else:
        # Imported here, not at module level — presumably to avoid a
        # circular import with the query module (NOTE(review): confirm)
        from query import create_name_register
        register = create_name_register(toklist, session, all_names=all_names)
    t3 = time.time()
    stats["tok_time"] = t1 - t0
    stats["parse_time"] = t2 - t1
    # total_time still covers the entire pipeline, including the register
    stats["total_time"] = t3 - t0
    return (pgs, stats, register)
def tag_text(session, text):
    """ Parse plain text and return the parsed paragraphs as lists of
        sentences where each sentence is a list of tagged tokens """
    start = time.time()
    # Split the input into explicitly demarcated paragraphs
    text = Fetcher.mark_paragraphs(text)
    # Tokenize the marked-up text
    toklist = list(tokenize(text, enclosing_session=session))
    # Result: a list of paragraphs, each a list of sentences,
    # each a list of tagged tokens
    paragraphs = []
    tokenized_at = time.time()
    # Don't emit diagnostic messages from the parser itself
    with Fast_Parser(verbose=False) as parser:
        inc = IncrementalParser(parser, toklist, verbose=True)
        for pg in inc.paragraphs():
            sentences = []
            for sent in pg.sentences():
                if sent.parse():
                    # Successfully parsed: dump tokens with the parse tree
                    dumped = Article._dump_tokens(sent.tokens, sent.tree, None)
                else:
                    # Parse failed: dump tokens with the error index instead
                    dumped = Article._dump_tokens(
                        sent.tokens, None, None, sent.err_index)
                sentences.append(dumped)
            paragraphs.append(sentences)
    parsed_at = time.time()
    stats = {
        "num_tokens": inc.num_tokens,
        "num_sentences": inc.num_sentences,
        "num_parsed": inc.num_parsed,
        "ambiguity": inc.ambiguity,
        "tok_time": tokenized_at - start,
        "parse_time": parsed_at - tokenized_at,
        "total_time": parsed_at - start,
    }
    # Attach a name register to the result
    register = create_name_register(toklist, session)
    return (paragraphs, stats, register)
def analyze():
    """ Analyze text manually entered by the user, i.e. not coming
        from an article """
    # Truncate the submitted text to the maximum allowed length
    text = request.form.get("text", "").strip()[0:_MAX_TEXT_LENGTH]
    with SessionContext(commit=True) as session:
        # Split the input into explicitly demarcated paragraphs
        text = Fetcher.mark_paragraphs(text)
        # Tokenize the marked-up text
        toklist = list(tokenize(text, enclosing_session=session))
        # Result: a list of paragraphs, each a list of sentences,
        # each a list of tagged tokens
        result = []
        # Don't emit diagnostic messages from the parser itself
        with Fast_Parser(verbose=False) as parser:
            inc = IncrementalParser(parser, toklist, verbose=True)
            for pg in inc.paragraphs():
                sentences = []
                for sent in pg.sentences():
                    if sent.parse():
                        # Successfully parsed: dump tokens with the parse tree
                        dumped = ArticleProxy._dump_tokens(
                            sent.tokens, sent.tree, None)
                    else:
                        # Parse failed: dump tokens with the error index
                        dumped = ArticleProxy._dump_tokens(
                            sent.tokens, None, None, sent.err_index)
                    sentences.append(dumped)
                result.append(sentences)
        stats = {
            "num_tokens": inc.num_tokens,
            "num_sentences": inc.num_sentences,
            "num_parsed": inc.num_parsed,
            "ambiguity": inc.ambiguity,
        }
        # Attach a name register to the result
        register = create_name_register(toklist, session)
        # Return the tokens as a JSON structure to the client
        return jsonify(result=result, stats=stats, register=register)