def sentence_to_tree(sentence):
    """
    Build a flat parse tree from the raw text of a sentence.

    The text is tokenized with ``word_tokenize`` and tagged with
    ``pos_tag``; every (word, tag) pair becomes a child node labelled with
    the tag and holding the word as its only leaf. All of those nodes hang
    directly under a root node labelled 'S'.

    Args:
        sentence: text of a sentence (must be a ``basestring`` instance)

    Return:
        ParentedTree rooted at 'S' with one child per token

    Raises:
        AssertionError: if ``sentence`` is not a string
    """
    assert isinstance(sentence, basestring)

    tagged_tokens = pos_tag(word_tokenize(sentence))

    root = ParentedTree('S', [])
    for word, pos in tagged_tokens:
        # Node label is the POS tag; the word itself is the leaf.
        leaf_node = ParentedTree(pos, [word])
        root.append(leaf_node)
    return root
def terms_inference(sentences, terms_trie):
    """
    Infer term occurrences in sentences and build one tree per sentence.

    Each sentence is scanned left to right. At every position the longest
    term from the trie that starts there is looked up via
    ``_longest_matching_term``. When a term is found, its tokens are grouped
    under a 'TERM' node (with the resolved term stored in the node's
    ``term`` attribute) and scanning resumes after the term. Otherwise the
    single token at that position is attached directly to the sentence root.

    Args:
        sentences: iterable of sentences, each an indexable sequence of
            tokens (tokenized and tagged, per the callers' convention)
        terms_trie: trie of terms, passed through to
            ``_longest_matching_term``

    Return:
        tuple of (list of 'S'-rooted sentence trees with inferred terms,
        dictionary mapping each term to the list of 'TERM' nodes where it
        occurs)
    """
    trees = []
    occurrences = defaultdict(list)

    for tokens in sentences:
        root = ParentedTree('S', [])
        cursor = 0

        while cursor < len(tokens):
            label, span = _longest_matching_term(tokens, cursor, terms_trie)

            if not span > 0:
                # No term starts at this position: attach the plain token
                # to the sentence root and advance by one.
                _append_word_token(root, tokens[cursor])
                cursor += 1
                continue

            # A term covering `span` tokens starts here.
            node = ParentedTree('TERM', [])
            resolved_term = name_to_term(label)
            node.term = resolved_term
            # Record the node so callers can locate every occurrence.
            occurrences[resolved_term].append(node)

            for covered_token in tokens[cursor:cursor + span]:
                _append_word_token(node, covered_token)

            root.append(node)
            cursor += span

        trees.append(root)

    return trees, occurrences